1 /* mpn_mu_div_qr, mpn_preinv_mu_div_qr. 2 3 Compute Q = floor(N / D) and R = N-QD. N is nn limbs and D is dn limbs and 4 must be normalized, and Q must be nn-dn limbs. The requirement that Q is 5 nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to 6 let N be unmodified during the operation. 7 8 Contributed to the GNU project by Torbjorn Granlund. 9 10 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE. IT IS 11 ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS 12 ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP 13 RELEASE. 14 15 Copyright 2005, 2006, 2007 Free Software Foundation, Inc. 16 17 This file is part of the GNU MP Library. 18 19 The GNU MP Library is free software; you can redistribute it and/or modify 20 it under the terms of the GNU Lesser General Public License as published by 21 the Free Software Foundation; either version 3 of the License, or (at your 22 option) any later version. 23 24 The GNU MP Library is distributed in the hope that it will be useful, but 25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 27 License for more details. 28 29 You should have received a copy of the GNU Lesser General Public License 30 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 31 32 33 /* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann 34 and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of 35 Jebelean's bidirectional exact division algorithm. 36 37 The idea of this algorithm is to compute a smaller inverted value than used 38 in the standard Barrett algorithm, and thus save time in the Newton 39 iterations, and pay just a small price when using the inverted value for 40 developing quotient bits. 41 42 Written by Torbjorn Granlund. Paul Zimmermann suggested the use of the 43 "wrap around" trick. 
Based on the GMP divexact code and inspired by code 44 contributed to GMP by Karl Hasselstroem. 45 */ 46 47 48 /* CAUTION: This code and the code in mu_divappr_q.c should be edited in lockstep. 49 50 Things to work on: 51 52 * Passing k isn't a great interface. Either 'in' should be passed, or 53 determined by the code. 54 55 * The current mpn_mu_div_qr_itch isn't exactly scientifically written. 56 Scratch space buffer overruns are not unlikely before some analysis is 57 applied. Since scratch requirements are expected to change, such an 58 analysis will have to wait til things settle. 59 60 * This isn't optimal when the remainder isn't needed, since the final 61 multiplication could be made special and take O(1) time on average, in that 62 case. This is particularly bad when qn << dn. At some level, code as in 63 GMP 4 mpn_tdiv_qr should be used, effectively dividing the leading 2qn 64 dividend limbs by the qn divisor limbs. 65 66 * This isn't optimal when the quotient isn't needed, as it might take a lot 67 of space. The computation is always needed, though, so there is not time 68 to save with special code. 69 70 * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, 71 demonstrated by the fact that the mpn_inv function's scratch needs means 72 that we need to keep a large allocation long after it is needed. Things 73 are worse as mpn_mul_fft does not accept any scratch parameter, which means 74 we'll have a large memory hole while in mpn_mul_fft. In general, a peak 75 scratch need in the beginning of a function isn't well-handled by the 76 itch/scratch scheme. 77 78 * Some ideas from comments in divexact.c apply to this code too. 
*/

/* the NOSTAT stuff handles properly the case where files are concatenated */
#ifdef NOSTAT
#undef STAT
#endif

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define NOSTAT
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp.h"
#include "gmp-impl.h"


/* Choose the size 'in' of the approximate inverse used by the MU division.

   In case k=0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
   (c) qn < dn/3:       in = qn
   In all cases we have in <= dn.

   When k != 0 the caller forces the quotient to be developed in k blocks,
   so in = ceil(min(dn,qn) / k).  (See the "Passing k isn't a great
   interface" note in the file header.)  */
mp_size_t
mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
	{
	  /* Compute an inverse size that is a nice partition of the quotient. */
	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
	}
      else if (3 * qn > dn)
	{
	  in = (qn - 1) / 2 + 1;	/* b = 2 */
	}
      else
	{
	  in = (qn - 1) / 1 + 1;	/* b = 1 */
	}
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;	/* ceil(min(dn,qn) / k) */
    }

  return in;
}

/* Core MU division: compute an approximate inverse of the top limbs of
   {dp,dn}, then delegate the block-wise quotient development to
   mpn_preinv_mu_div_qr.

   Writes the quotient to {qp, nn-dn} and the remainder to {rp, dn};
   {np, nn} is read-only here (see the file-header note about why Q is
   defined as exactly nn-dn limbs).  Currently always returns 0; the
   disabled qh code below explains why a high quotient limb cannot be
   produced without clobbering N.  */
static mp_limb_t
mpn_mu_div_qr2 (mp_ptr qp,
		mp_ptr rp,
		mp_ptr np,
		mp_size_t nn,
		mp_srcptr dp,
		mp_size_t dn,
		mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy;
  mp_ptr ip, tp;

  /* FIXME: We should probably not handle tiny operands, but do it for now. */
  if (dn == 1)
    {
      /* Single-limb divisor: plain schoolbook division.  The quotient is
	 formed in scratch (nn limbs) since qp only has nn-1 limbs; the top
	 quotient limb is the return value.  */
      rp[0] = mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
      MPN_COPY (qp, scratch, nn - 1);
      return scratch[nn - 1];
    }

  qn = nn - dn;

  /* Compute the inverse size.  */
  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invert scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      /* The whole divisor is used; prepend a limb (tp[0] = 1) so the in+1
	 limb inversion sees a value strictly above the divisor.  */
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invert (ip, tp, in + 1, NULL);
      MPN_COPY_INCR (ip, ip + 1, in);	/* round in+1 limb inverse to in limbs */
    }
  else
    {
      /* Invert the top in+1 divisor limbs plus one, so the truncated
	 inverse never overestimates.  */
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);	/* divisor was 111...111; inverse is 0 */
      else
	{
	  mpn_invert (ip, tp, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invert (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  /* We can't really handle qh = 1 like this since we'd here clobber N, which is
     not allowed in the way we've defined this function's API.  */
#if 0
  qh = mpn_cmp (np + qn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np + qn, np + qn, dp, dn);
#endif

  mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);

  /*  return qh;  */
  return 0;
}

/* Divide {np,nn} by {dp,dn} using the precomputed approximate inverse
   {ip,in}, developing the quotient {qp, nn-dn} in blocks of (at most) 'in'
   limbs from the most significant end, and leaving the remainder in
   {rp,dn}.  {np,nn} is not modified.

   NOTE(review): callers must supply an inverse computed as in
   mpn_mu_div_qr2 above; the probability figures in the adjustment loops
   below assume that ("if inverse is computed as recommended").  */
void
mpn_preinv_mu_div_qr (mp_ptr qp,
		      mp_ptr rp,
		      mp_ptr np,
		      mp_size_t nn,
		      mp_srcptr dp,
		      mp_size_t dn,
		      mp_srcptr ip,
		      mp_size_t in,
		      mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy;
  mp_ptr tp;
  mp_limb_t r;

  qn = nn - dn;

  if (qn == 0)
    {
      /* No quotient limbs to develop; the remainder is N itself.  */
      MPN_COPY (rp, np, dn);
      return;
    }

  tp = scratch;

  /* Walk np/qp downward from the most significant block.  */
  np += qn;
  qp += qn;

  /* Initial partial remainder R = top dn limbs of N.  */
  MPN_COPY (rp, np, dn);

  while (qn > 0)
    {
      if (qn < in)
	{
	  /* Last (partial) block: shrink the inverse by dropping its low
	     limbs so block size matches the remaining quotient limbs.  */
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
	 by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);	/* FIXME */

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from the
	 dividend N.  We only really need the low dn limbs.  */
#if WANT_FFT
      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
	{
	  /* Use the wrap-around trick: multiply mod B^m - 1 with m < dn+in,
	     then compensate for the wrapped (high) limbs.  */
	  mp_size_t m, wn;
	  int k;

	  k = mpn_fft_best_k (dn + 1, 0);
	  m = mpn_fft_next_size (dn + 1, k);
	  wn = dn + in - m;			/* number of wrapped limbs */

	  mpn_mul_fft (tp, m, dp, dn, qp, in, k);

	  if (wn > 0)
	    {
	      /* Add back the wrapped product limbs ...  */
	      cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
	      mpn_incr_u (tp + wn, cy);

	      /* ... and fix a possible off-by-one caused by the modular
		 reduction having subtracted one wrap of B^m - 1.  */
	      cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
	      mpn_decr_u (tp, cy);
	    }
	}
      else
#endif
	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */

      /* Borrow bookkeeping for the limb just above the subtraction below.  */
      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about 31%
	     probability, 2 times with about 0.6% probability, if inverse is
	     computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );

      qn -= in;
    }
}

#define THRES 100		/* FIXME: somewhat arbitrary */

#ifdef CHECK
#undef THRES
#define THRES 1
#endif

/* Public entry point: Q = floor(N/D), R = N - Q*D, for {np,nn} / {dp,dn}
   with D normalized (see file header).  Quotient goes to {qp, nn-dn},
   remainder to {rp,dn}; N is left unmodified.

   When the quotient is much shorter than the divisor (qn + THRES < dn), a
   truncated division of the high parts is done first and the contribution
   of the ignored low divisor limbs is subtracted afterwards, with at most
   one final quotient correction.  Otherwise the full mpn_mu_div_qr2 path
   is used.  Return value is currently always 0 (see FIXME).  */
mp_limb_t
mpn_mu_div_qr (mp_ptr qp,
	       mp_ptr rp,
	       mp_ptr np,
	       mp_size_t nn,
	       mp_srcptr dp,
	       mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;

  qn = nn - dn;
  if (qn + THRES < dn)
    {
      /* |______________|________|   dividend			nn
		|_______|________|   divisor			dn

		|______|	     quotient (prel)		qn

		 |_______________|   quotient * ignored-part-of(divisor)  dn-1
      */

      mp_limb_t cy, x;

      if (mpn_cmp (np + nn - (qn + 1), dp + dn - (qn + 1), qn + 1) >= 0)
	{
	  /* Quotient is 111...111, could optimize this rare case at some point.  */
	  mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
	  return 0;
	}

      /* Compute a preliminary quotient and a partial remainder by dividing the
	 most significant limbs of each operand.  */
      mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
		      np + nn - (2 * qn + 1), 2 * qn + 1,
		      dp + dn - (qn + 1), qn + 1,
		      scratch);

      /* Multiply the quotient by the divisor limbs ignored above.  Argument
	 order is chosen so the larger operand comes first, as mpn_mul
	 requires.  */
      if (dn - (qn + 1) > qn)
	mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
      else
	mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */

      /* Subtract that product from the dividend's low limbs combined with
	 the partial remainder, propagating the borrow upward.  */
      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
		       rp + nn - (2 * qn + 1),
		       scratch + nn - (2 * qn + 1),
		       qn, cy);
      x = rp[dn - 1];
      rp[dn - 1] = x - cy;
      if (cy > x)
	{
	  /* Borrow out of the top limb: preliminary quotient was one too
	     large; add D back once.  */
	  mpn_decr_u (qp, 1);
	  mpn_add_n (rp, rp, dp, dn);
	}
    }
  else
    {
      return mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
    }

  return 0;			/* FIXME */
}

/* Scratch-space estimate (in limbs) for mpn_mu_div_qr with the given
   operand sizes and block-count parameter mua_k.  */
mp_size_t
mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, m;
  int k;

  /* FIXME: This isn't very carefully written, and might grossly overestimate
     the amount of scratch needed, and might perhaps also underestimate it,
     leading to potential buffer overruns.  In particular k=0 might lead to
     gross overestimates.  */

  if (dn == 1)
    return nn;		/* dn==1 path forms the full nn limb quotient in scratch */

  qn = nn - dn;
  if (qn >= dn)
    {
      k = mpn_fft_best_k (dn + 1, 0);
      m = mpn_fft_next_size (dn + 1, k);	/* FFT product size, as in preinv loop */
      return (mua_k <= 1
	      ? 6 * dn
	      : m + 2 * dn);
    }
  else
    {
      k = mpn_fft_best_k (dn + 1, 0);
      m = mpn_fft_next_size (dn + 1, k);
      return (mua_k <= 1
	      ? m + 4 * qn
	      : m + 2 * qn);
    }
}