/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.

   Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
   normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
   is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
   to let N be unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
   ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
   RELEASE.

Copyright 2005, 2006, 2007 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann
   and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of
   Jebelean's bidirectional exact division algorithm.

   The idea of this algorithm is to compute a smaller inverted value than used
   in the standard Barrett algorithm, and thus save time in the Newton
   iterations, and pay just a small price when using the inverted value for
   developing quotient bits.

   Written by Torbjorn Granlund.  Paul Zimmermann suggested the use of the
   "wrap around" trick.  Based on the GMP divexact code and inspired by code
   contributed to GMP by Karl Hasselstroem.  */


/* CAUTION: This code and the code in mu_div_qr.c should be edited in lockstep.

   Things to work on:

   * Passing k isn't a great interface.  Either 'in' should be passed, or
     determined by the code.

   * The current mpn_mu_div_qr_itch isn't exactly scientifically written.
     Scratch space buffer overruns are not unlikely before some analysis is
     applied.  Since scratch requirements are expected to change, such an
     analysis will have to wait until things settle.

   * This isn't optimal when the remainder isn't needed, since the final
     multiplication could then be made special and take O(1) time on average.
     This is particularly bad when qn << dn.  At some level, code as in GMP 4
     mpn_tdiv_qr should be used, effectively dividing the leading 2qn dividend
     limbs by the qn divisor limbs.

   * This isn't optimal when the quotient isn't needed, as it might take a lot
     of space.  The computation is always needed, though, so there is no time
     to save with special code.

   * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
     demonstrated by the fact that the mpn_invert function's scratch needs mean
     that we need to keep a large allocation long after it is needed.  Things
     are worse as mpn_mul_fft does not accept any scratch parameter, which
     means we'll have a large memory hole while in mpn_mul_fft.  In general, a
     peak scratch need at the beginning of a function isn't well handled by the
     itch/scratch scheme.

   * Some ideas from the comments in divexact.c apply to this code too.  */

/* The NOSTAT machinery properly handles the case where source files are
   concatenated.  */
#ifdef NOSTAT
#undef STAT
#endif

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define NOSTAT
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp.h"
#include "gmp-impl.h"


/* In case k = 0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:          in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn:  in = ceil(qn / 2)
   (c) qn < dn/3:        in = qn
   In all cases we have in <= dn.  */
mp_size_t
mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
        {
          /* Compute an inverse size that is a nice partition of the quotient.  */
          b = (qn - 1) / dn + 1;   /* ceil(qn/dn), number of blocks */
          in = (qn - 1) / b + 1;   /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
        }
      else if (3 * qn > dn)
        {
          in = (qn - 1) / 2 + 1;   /* b = 2 */
        }
      else
        {
          in = (qn - 1) / 1 + 1;   /* b = 1 */
        }
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}
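
/* Worked examples of the k = 0 heuristic above (added for illustration; the
   numbers follow directly from the formulas and carry no further guarantee):

     (a) qn = 10, dn = 4:  b = ceil(10/4) = 3, so in = ceil(10/3) = 4, and the
         quotient is developed in 3 blocks of at most 4 limbs each.
     (b) qn = 5,  dn = 8:  3*5 > 8, so in = ceil(5/2) = 3  (two blocks).
     (c) qn = 2,  dn = 9:  3*2 <= 9, so in = qn = 2        (one block).

   In each case in <= dn, as the caller below asserts.  */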

mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
                  mp_ptr np,
                  mp_size_t nn,
                  mp_srcptr dp,
                  mp_size_t dn,
                  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy;
  mp_ptr ip, tp;

  /* FIXME: We should probably not handle tiny operands, but do it for now.  */
  if (dn == 1)
    {
      mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
      MPN_COPY (qp, scratch, nn - 1);
      return scratch[nn - 1];
    }

  qn = nn - dn;

#if 1
  /* If Q is smaller than D, truncate the operands.  */
  if (qn + 1 < dn)
    {
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;

      /* Since D is cut here, we can have a carry in N'/D' even if we don't
         have it for N/D.  */
      if (mpn_cmp (np + nn - (qn + 1), dp, qn + 1) >= 0)
        {
          /* The quotient is 111...111.  */
          mp_size_t i;
          for (i = 0; i <= qn; i++)
            qp[i] = ~ (mp_limb_t) 0;
          return 0;
        }
    }
#endif

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) temp allocation needs not analysed, (2) itch
     function not adapted, (3) mpn_invert scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* Compute an approximate inverse on in+1 limbs.  */
  if (dn == in)
    {
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invert (ip, tp, in + 1, NULL);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
        MPN_ZERO (ip, in);
      else
        {
          mpn_invert (ip, tp, in + 1, NULL);
          MPN_COPY_INCR (ip, ip + 1, in);
        }
    }
#else
  /* This older inverse computation method gets slightly worse results than
     the one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute the inverse of D to in+1 limbs, then round it to 'in' limbs.
     Ideally the inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invert (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  /* We can't really handle qh = 1 like this, since we'd here clobber N, which
     is not allowed in the way we've defined this function's API.  */
#if 0
  qh = mpn_cmp (np + qn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np + qn, np + qn, dp, dn);
#endif

  mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

/*  return qh;  */
  return 0;
}
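
/* Illustrative usage sketch (not part of the library; it assumes the usual
   gmp-impl.h TMP_* scratch-allocation macros, a normalized divisor {dp,dn}
   with nn >= dn >= 2, and a writable quotient area {qp,nn-dn}):

     mp_limb_t qh;
     TMP_DECL;
     TMP_MARK;
     {
       mp_ptr scratch = TMP_ALLOC_LIMBS (mpn_mu_divappr_q_itch (nn, dn, 0));
       qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, scratch);
     }
     TMP_FREE;

   Here mpn_mu_divappr_q_itch is the estimator defined at the end of this
   file.  Per the comment at the top of the file, {qp,nn-dn} is then
   floor(N/D) + e with 0 <= e <= 4, and N is left unmodified.  The return
   value plays the role of a high quotient limb; in the current code it is
   nonzero only in the dn == 1 path.  */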

void
mpn_preinv_mu_divappr_q (mp_ptr qp,
                         mp_ptr np,
                         mp_size_t nn,
                         mp_srcptr dp,
                         mp_size_t dn,
                         mp_srcptr ip,
                         mp_size_t in,
                         mp_ptr scratch)
{
  mp_ptr rp;
  mp_size_t qn;
  mp_limb_t cy;
  mp_ptr tp;
  mp_limb_t r;

  qn = nn - dn;

  if (qn == 0)
    return;

  rp = scratch;
  tp = scratch + dn;

  np += qn;
  qp += qn;

  MPN_COPY (rp, np, dn);

  while (qn > 0)
    {
      if (qn < in)
        {
          ip += in - qn;
          in = qn;
        }
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
         by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);            /* mulhi */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);  /* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);                         /* FIXME */

      qn -= in;
      if (qn == 0)
        break;

      /* Compute the product of the quotient block and the divisor D, to be
         subtracted from the partial remainder combined with new limbs from the
         dividend N.  We only really need the low dn limbs.  */
#if WANT_FFT
      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
        {
          /* Use the wrap-around trick.  */
          mp_size_t m, wn;
          int k;

          k = mpn_fft_best_k (dn + 1, 0);
          m = mpn_fft_next_size (dn + 1, k);
          wn = dn + in - m;                     /* number of wrapped limbs */

          mpn_mul_fft (tp, m, dp, dn, qp, in, k);

          if (wn > 0)
            {
              cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
              mpn_incr_u (tp + wn, cy);

              cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
              mpn_decr_u (tp, cy);
            }
        }
      else
#endif
        mpn_mul (tp, dp, dn, qp, in);   /* dn+in limbs, high 'in' cancels */

      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
         limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
        {
          cy = mpn_sub_n (tp, np, tp, in);      /* get next 'in' limbs from N */
          cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
          MPN_COPY (rp, tp, dn);                /* FIXME: try to avoid this */
        }
      else
        {
          cy = mpn_sub_n (rp, np, tp, in);      /* get next 'in' limbs from N */
        }

      STAT (int i; int err = 0;
            static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
        {
          /* We loop 0 times with about 69% probability, 1 time with about 31%
             probability, and 2 times with about 0.6% probability, if the
             inverse is computed as recommended.  */
          mpn_incr_u (qp, 1);
          cy = mpn_sub_n (rp, rp, dp, dn);
          r -= cy;
          STAT (err++);
        }
      if (mpn_cmp (rp, dp, dn) >= 0)
        {
          /* This is executed with about 76% probability.  */
          mpn_incr_u (qp, 1);
          cy = mpn_sub_n (rp, rp, dp, dn);
          STAT (err++);
        }

      STAT (
            tot++;
            errarr[err]++;
            if (err > err_rec)
              err_rec = err;
            if (tot % 0x10000 == 0)
              {
                for (i = 0; i <= err_rec; i++)
                  printf ("  %d(%.1f%%)", errarr[i], 100.0 * errarr[i] / tot);
                printf ("\n");
              }
            );
    }

  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     quotient.  For now, just make sure the returned quotient is >= the real
     quotient: add 3, saturating at the largest qn-limb value.  */
  qn = nn - dn;
  cy = mpn_add_1 (qp, qp, qn, 3);
  if (cy != 0)
    {
      MPN_ZERO (qp, qn);
      mpn_sub_1 (qp, qp, qn, 1);
    }
}
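
/* Sketch of the block-quotient step in the loop above (added for exposition;
   B denotes 2^GMP_NUMB_BITS): with R1 the top 'in' limbs of the partial
   remainder and I the 'in'-limb approximate inverse, whose leading limb (a 1)
   is implicit, the mpn_mul_n/mpn_add_n pair computes

     Q_block = R1 + floor (R1 * I / B^in) = floor (R1 * (B^in + I) / B^in),

   a Barrett-style quotient block.  Since I only approximates the scaled
   reciprocal of D, Q_block can be slightly too small; the mpn_incr_u and
   mpn_sub_n adjustment steps after the subtraction correct this, with the
   probabilities quoted in the comments above.  */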

mp_size_t
mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, m;
  int k;

  /* FIXME: This isn't very carefully written, and might grossly overestimate
     the amount of scratch needed, and might perhaps also underestimate it,
     leading to potential buffer overruns.  In particular, k = 0 might lead to
     gross overestimates.  */

  if (dn == 1)
    return nn;

  qn = nn - dn;
  if (qn >= dn)
    {
      k = mpn_fft_best_k (dn + 1, 0);
      m = mpn_fft_next_size (dn + 1, k);
      return dn + (mua_k <= 1
                   ? 6 * dn
                   : m + 2 * dn);
    }
  else
    {
      k = mpn_fft_best_k (dn + 1, 0);
      m = mpn_fft_next_size (dn + 1, k);
      return dn + (mua_k <= 1
                   ? m + 4 * qn
                   : m + 2 * qn);
    }
}
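
/* Illustrative reading of the estimates above (still subject to the FIXME):
   for the balanced case qn >= dn with mua_k <= 1 the request amounts to
   dn + 6*dn = 7*dn limbs, while for qn < dn it is dn + m + 4*qn limbs with
   m = mpn_fft_next_size (dn + 1, k).  These are coarse bounds rather than
   the result of a worked-out scratch analysis.  */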