/* mpn_mul_n and helper function -- Multiply/square natural numbers.

   THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n) ARE
   INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH
   DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE
   OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
2005 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Multiplies using 3 half-sized mults and so on recursively.
 * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
 * No overlap of p[...] with a[...] or b[...].
 * ws is workspace.
 */

void
mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;
  int sign;

  n2 = n >> 1;
  ASSERT (n2 > 0);              /* caller guarantees n >= 2 */

  if ((n & 1) != 0)
    {
      /* Odd length.  Low halves have n3 = n2+1 limbs, high halves n2.  */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      /* Compute |a_low - a_high| into {p, n3}; `sign` tracks whether the
	 overall sign of (a_low - a_high)*(b_low - b_high) is negative.  */
      sign = 0;
      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p, a, a + n3, n2);
      else
	{
	  /* Top limb of a_low is zero: scan from the top to find which of
	     the two n2-limb operands is larger, then subtract the smaller
	     from the larger so the difference is non-negative.  */
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	      sign = ~0;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p, x, y, n2);
	}
      p[n2] = w;

      /* Same for b, into {p + n3, n3}.  */
      w = b[n2];
      if (w != 0)
	w -= mpn_sub_n (p + n3, b, b + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = b[i];
	      w1 = b[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = b + n3;
	      y = b;
	      sign = ~sign;
	    }
	  else
	    {
	      x = b;
	      y = b + n3;
	    }
	  mpn_sub_n (p + n3, x, y, n2);
	}
      p[n] = w;

      /* Three pointwise products; the middle one goes to ws, the outer two
	 to the low and high parts of p.  */
      n1 = n + 1;
      if (n2 < MUL_KARATSUBA_THRESHOLD)
	{
	  if (n3 < MUL_KARATSUBA_THRESHOLD)
	    {
	      mpn_mul_basecase (ws, p, n3, p + n3, n3);
	      mpn_mul_basecase (p, a, n3, b, n3);
	    }
	  else
	    {
	      mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
	      mpn_kara_mul_n (p, a, b, n3, ws + n1);
	    }
	  mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
	}
      else
	{
	  mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
	  mpn_kara_mul_n (p, a, b, n3, ws + n1);
	  mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
	}

      /* Interpolate: add or subtract the middle product according to the
	 sign computed during evaluation.  */
      if (sign)
	mpn_add_n (ws, p, ws, n1);
      else
	mpn_sub_n (ws, p, ws, n1);

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))
	{
	  mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
	  ws[nm1] = x;
	  if (x == 0)
	    ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
	}
      if (mpn_add_n (p + n3, p + n3, ws, n1))
	{
	  mpn_incr_u (p + n1 + n3, 1);
	}
    }
  else
    {
      /* Even length.  Both halves have exactly n2 limbs.  */
      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2 + i];
	}
      while (w0 == w1 && i != 0);
      sign = 0;
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	  sign = ~0;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p, x, y, n2);

      i = n2;
      do
	{
	  --i;
	  w0 = b[i];
	  w1 = b[n2 + i];
	}
      while (w0 == w1 && i != 0);
      if (w0 < w1)
	{
	  x = b + n2;
	  y = b;
	  sign = ~sign;
	}
      else
	{
	  x = b;
	  y = b + n2;
	}
      mpn_sub_n (p + n2, x, y, n2);

      /* Pointwise products.  */
      if (n2 < MUL_KARATSUBA_THRESHOLD)
	{
	  mpn_mul_basecase (ws, p, n2, p + n2, n2);
	  mpn_mul_basecase (p, a, n2, b, n2);
	  mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
	}
      else
	{
	  mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
	  mpn_kara_mul_n (p, a, b, n2, ws + n);
	  mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
	}

      /* Interpolate.  */
      if (sign)
	w = mpn_add_n (ws, p, ws, n);
      else
	w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}

/* Squares {a, n} into {p, 2n} using the same Karatsuba scheme as
   mpn_kara_mul_n with b = a; since the operand is squared the middle
   product (a_low - a_high)^2 is always non-negative, so no sign needs to
   be tracked.  ws is workspace.  */

void
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
{
  mp_limb_t w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;
  mp_size_t i;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  if ((n & 1) != 0)
    {
      /* Odd length.  */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      /* |a_low - a_high| into {p, n3}.  */
      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p, a, a + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3 + i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p, x, y, n2);
	}
      p[n2] = w;

      n1 = n + 1;

      /* n2 is always either n3 or n3-1 so maybe the two sets of tests here
	 could be combined.  But that's not important, since the tests will
	 take a minuscule amount of time compared to the function calls.  */
      if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
	{
	  mpn_mul_basecase (ws, p, n3, p, n3);
	  mpn_mul_basecase (p, a, n3, a, n3);
	}
      else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
	{
	  mpn_sqr_basecase (ws, p, n3);
	  mpn_sqr_basecase (p, a, n3);
	}
      else
	{
	  mpn_kara_sqr_n (ws, p, n3, ws + n1);	 /* (x-y)^2 */
	  mpn_kara_sqr_n (p, a, n3, ws + n1);	 /* x^2 */
	}
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
	mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
	mpn_sqr_basecase (p + n1, a + n3, n2);
      else
	mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);	 /* y^2 */

      /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
	 borrow from mpn_sub_n.  If it occurs then it'll be cancelled by a
	 carry from ws[n].  Further, since 2xy fits in n1 limbs there won't
	 be any carry out of ws[n] other than cancelling that borrow.  */

      mpn_sub_n (ws, p, ws, n1);	     /* x^2-(x-y)^2 */

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))   /* x^2+y^2-(x-y)^2 = 2xy */
	{
	  mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
	  ws[nm1] = x;
	  if (x == 0)
	    ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
	}
      if (mpn_add_n (p + n3, p + n3, ws, n1))
	{
	  mpn_incr_u (p + n1 + n3, 1);
	}
    }
  else
    {
      /* Even length.  */
      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2 + i];
	}
      while (w0 == w1 && i != 0);
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p, x, y, n2);

      /* Pointwise products.  */
      if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
	{
	  mpn_mul_basecase (ws, p, n2, p, n2);
	  mpn_mul_basecase (p, a, n2, a, n2);
	  mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
	}
      else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
	{
	  mpn_sqr_basecase (ws, p, n2);
	  mpn_sqr_basecase (p, a, n2);
	  mpn_sqr_basecase (p + n, a + n2, n2);
	}
      else
	{
	  mpn_kara_sqr_n (ws, p, n2, ws + n);
	  mpn_kara_sqr_n (p, a, n2, ws + n);
	  mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
	}

      /* Interpolate.  */
      w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
    }
}

/******************************************************************************
 *                                                                            *
 *              Toom 3-way multiplication and squaring                        *
 *                                                                            *
 *****************************************************************************/

/* Starts from:
   {v0,2k}    (stored in {c,2k})
   {vm1,2k+1} (whose sign is sa, and absolute value is stored in {vm1,2k+1})
   {v1,2k+1}  (stored in {c+2k,2k+1})
   {v2,2k+1}
   {vinf,twor} (stored in {c+4k,twor}, except the first limb, saved in vinf0)

   ws is temporary space, and should have at least twor limbs.

   put in {c, 2n} where n = 2k+twor the value of {v0,2k} (already in place)
   + B^k * {tm1, 2k+1}
   + B^(2k) * {t1, 2k+1}
   + B^(3k) * {t2, 2k+1}
   + B^(4k) * {vinf,twor} (high twor-1 limbs already in place)
   where {t1, 2k+1} = ({v1, 2k+1} + sa * {vm1, 2k+1} - 2*{v0,2k})/2 - {vinf,twor}
   {t2, 2k+1} = (3*({v1,2k+1}-{v0,2k})-sa*{vm1,2k+1}+{v2,2k+1})/6-2*{vinf,twor}
   {tm1,2k+1} = ({v1,2k+1}-sa*{vm1,2k+1})/2-{t2,2k+1}

   Exact sequence described in a comment in mpn_toom3_mul_n.
   mpn_toom3_mul_n() and mpn_toom3_sqr_n() implement steps 1-2.
   mpn_toom_interpolate_5pts() implements steps 3-4.

   Reference: What About Toom-Cook Matrices Optimality?
Marco Bodrato 396 and Alberto Zanoni, October 19, 2006, http://bodrato.it/papers/#CIVV2006 397 398 ************* saved note **************** 399 Think about: 400 401 The evaluated point a-b+c stands a good chance of having a zero carry 402 limb, a+b+c would have a 1/4 chance, and 4*a+2*b+c a 1/8 chance, roughly. 403 Perhaps this could be tested and stripped. Doing so before recursing 404 would be better than stripping at the start of mpn_toom3_mul_n/sqr_n, 405 since then the recursion could be based on the new size. Although in 406 truth the kara vs toom3 crossover is never so exact that one limb either 407 way makes a difference. 408 409 A small value like 1 or 2 for the carry could perhaps also be handled 410 with an add_n or addlsh1_n. Would that be faster than an extra limb on a 411 (recursed) multiply/square? 412 */ 413 414 #define TOOM3_MUL_REC(p, a, b, n, ws) \ 415 do { \ 416 if (MUL_TOOM3_THRESHOLD / 3 < MUL_KARATSUBA_THRESHOLD \ 417 && BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \ 418 mpn_mul_basecase (p, a, n, b, n); \ 419 else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD)) \ 420 mpn_kara_mul_n (p, a, b, n, ws); \ 421 else \ 422 mpn_toom3_mul_n (p, a, b, n, ws); \ 423 } while (0) 424 425 #define TOOM3_SQR_REC(p, a, n, ws) \ 426 do { \ 427 if (SQR_TOOM3_THRESHOLD / 3 < SQR_BASECASE_THRESHOLD \ 428 && BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) \ 429 mpn_mul_basecase (p, a, n, a, n); \ 430 else if (SQR_TOOM3_THRESHOLD / 3 < SQR_KARATSUBA_THRESHOLD \ 431 && BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \ 432 mpn_sqr_basecase (p, a, n); \ 433 else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ 434 mpn_kara_sqr_n (p, a, n, ws); \ 435 else \ 436 mpn_toom3_sqr_n (p, a, n, ws); \ 437 } while (0) 438 439 /* The necessary temporary space T(n) satisfies T(n)=0 for n < THRESHOLD, 440 and T(n) <= max(2n+2, 6k+3, 4k+3+T(k+1)) otherwise, where k = ceil(n/3). 441 442 Assuming T(n) >= 2n, 6k+3 <= 4k+3+T(k+1). 443 Similarly, 2n+2 <= 6k+2 <= 4k+3+T(k+1). 
444 445 With T(n) = 2n+S(n), this simplifies to S(n) <= 9 + S(k+1). 446 Since THRESHOLD >= 17, we have n/(k+1) >= 19/8 447 thus S(n) <= S(n/(19/8)) + 9 thus S(n) <= 9*log(n)/log(19/8) <= 8*log2(n). 448 */ 449 450 void 451 mpn_toom3_mul_n (mp_ptr c, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr t) 452 { 453 mp_size_t k, k1, kk1, r, twok, twor; 454 mp_limb_t cy, cc, saved, vinf0; 455 mp_ptr trec; 456 int sa, sb; 457 mp_ptr c1, c2, c3, c4, c5; 458 459 ASSERT(GMP_NUMB_BITS >= 6); 460 ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */ 461 462 /* 463 The algorithm is the following: 464 465 0. k = ceil(n/3), r = n - 2k, B = 2^(GMP_NUMB_BITS), t = B^k 466 1. split a and b in three parts each a0, a1, a2 and b0, b1, b2 467 with a0, a1, b0, b1 of k limbs, and a2, b2 of r limbs 468 2. Evaluation: vm1 may be negative, the other can not. 469 v0 <- a0*b0 470 v1 <- (a0+a1+a2)*(b0+b1+b2) 471 v2 <- (a0+2*a1+4*a2)*(b0+2*b1+4*b2) 472 vm1 <- (a0-a1+a2)*(b0-b1+b2) 473 vinf <- a2*b2 474 3. Interpolation: every result is positive, all divisions are exact 475 t2 <- (v2 - vm1)/3 476 tm1 <- (v1 - vm1)/2 477 t1 <- (v1 - v0) 478 t2 <- (t2 - t1)/2 479 t1 <- (t1 - tm1 - vinf) 480 t2 <- (t2 - 2*vinf) 481 tm1 <- (tm1 - t2) 482 4. result is c0+c1*t+c2*t^2+c3*t^3+c4*t^4 where 483 c0 <- v0 484 c1 <- tm1 485 c2 <- t1 486 c3 <- t2 487 c4 <- vinf 488 */ 489 490 k = (n + 2) / 3; /* ceil(n/3) */ 491 twok = 2 * k; 492 k1 = k + 1; 493 kk1 = k + k1; 494 r = n - twok; /* last chunk */ 495 twor = 2 * r; 496 497 c1 = c + k; 498 c2 = c1 + k; 499 c3 = c2 + k; 500 c4 = c3 + k; 501 c5 = c4 + k; 502 503 trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */ 504 505 /* put a0+a2 in {c, k+1}, and b0+b2 in {c+4k+2, k+1}; 506 put a0+a1+a2 in {t, k+1} and b0+b1+b2 in {t+k+1,k+1} 507 [????requires 5k+3 <= 2n, ie. 
n >= 9] */ 508 cy = mpn_add_n (c, a, a + twok, r); 509 cc = mpn_add_n (c4 + 2, b, b + twok, r); 510 if (r < k) 511 { 512 __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy); 513 __GMPN_ADD_1 (cc, c4 + 2 + r, b + r, k - r, cc); 514 } 515 516 /* Put in {t, k+1} the sum 517 * (a_0+a_2) - stored in {c, k+1} - 518 * + 519 * a_1 - stored in {a+k, k} */ 520 t[k] = (c1[0] = cy) + mpn_add_n (t, c, a + k, k); 521 /* ^ ^ 522 * carry of a_0 + a_2 carry of (a_0+a_2) + a_1 523 524 */ 525 526 /* Put in {t+k+1, k+1} the sum of the two values 527 * (b_0+b_2) - stored in {c1+1, k+1} - 528 * + 529 * b_1 - stored in {b+k, k} */ 530 t[kk1] = (c5[3] = cc) + mpn_add_n (t + k1, c4 + 2, b + k, k); 531 /* ^ ^ 532 * carry of b_0 + b_2 carry of (b_0+b_2) + b_1 */ 533 534 #define v2 (t+2*k+1) 535 536 /* compute v1 := (a0+a1+a2)*(b0+b1+b2) in {t, 2k+1}; 537 since v1 < 9*B^(2k), v1 uses only 2k+1 words if GMP_NUMB_BITS >= 4 */ 538 TOOM3_MUL_REC (c2, t, t + k1, k1, trec); 539 540 /* c c2 c4 t 541 {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} 542 v1 */ 543 544 /* put |a0-a1+a2| in {c, k+1} and |b0-b1+b2| in {c+4k+2,k+1} */ 545 /* (They're already there, actually) */ 546 547 /* sa = sign(a0-a1+a2) */ 548 sa = (cy != 0) ? 1 : mpn_cmp (c, a + k, k); 549 c[k] = (sa >= 0) ? cy - mpn_sub_n (c, c, a + k, k) 550 : mpn_sub_n (c, a + k, c, k); 551 552 sb = (cc != 0) ? 1 : mpn_cmp (c4 + 2, b + k, k); 553 c5[2] = (sb >= 0) ? cc - mpn_sub_n (c4 + 2, c4 + 2, b + k, k) 554 : mpn_sub_n (c4 + 2, b + k, c4 + 2, k); 555 sa *= sb; /* sign of vm1 */ 556 557 /* compute vm1 := (a0-a1+a2)*(b0-b1+b2) in {t, 2k+1}; 558 since |vm1| < 4*B^(2k), vm1 uses only 2k+1 limbs */ 559 TOOM3_MUL_REC (t, c, c4 + 2, k1, trec); 560 561 /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} 562 v1 vm1 563 */ 564 565 /* compute a0+2a1+4a2 in {c, k+1} and b0+2b1+4b2 in {c+4k+2, k+1} 566 [requires 5k+3 <= 2n, i.e. 
n >= 17] */ 567 #ifdef HAVE_NATIVE_mpn_addlsh1_n 568 c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r); 569 c5[2] = mpn_addlsh1_n (c4 + 2, b + k, b + twok, r); 570 if (r < k) 571 { 572 __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]); 573 __GMPN_ADD_1 (c5[2], c4 + 2 + r, b + k + r, k - r, c5[2]); 574 } 575 c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k); 576 c5[2] = 2 * c5[2] + mpn_addlsh1_n (c4 + 2, b, c4 + 2, k); 577 #else 578 c[r] = mpn_lshift (c, a + twok, r, 1); 579 c4[r + 2] = mpn_lshift (c4 + 2, b + twok, r, 1); 580 if (r < k) 581 { 582 MPN_ZERO(c + r + 1, k - r); 583 MPN_ZERO(c4 + r + 3, k - r); 584 } 585 c1[0] += mpn_add_n (c, c, a + k, k); 586 c5[2] += mpn_add_n (c4 + 2, c4 + 2, b + k, k); 587 mpn_lshift (c, c, k1, 1); 588 mpn_lshift (c4 + 2, c4 + 2, k1, 1); 589 c1[0] += mpn_add_n (c, c, a, k); 590 c5[2] += mpn_add_n (c4 + 2, c4 + 2, b, k); 591 #endif 592 593 /* compute v2 := (a0+2a1+4a2)*(b0+2b1+4b2) in {t+2k+1, 2k+1} 594 v2 < 49*B^k so v2 uses at most 2k+1 limbs if GMP_NUMB_BITS >= 6 */ 595 TOOM3_MUL_REC (v2, c, c4 + 2, k1, trec); 596 597 /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} 598 v1 vm1 v2 599 */ 600 601 /* compute v0 := a0*b0 in {c, 2k} */ 602 TOOM3_MUL_REC (c, a, b, k, trec); 603 604 /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} 605 v0 v1 vm1 v2 */ 606 607 /* compute vinf := a2*b2 in {t+4k+2, 2r}: in {c4, 2r} */ 608 609 saved = c4[0]; /* Remember v1's highest byte (will be overwritten). */ 610 TOOM3_MUL_REC (c4, a + twok, b + twok, r, trec); /* Overwrites c4[0]. */ 611 vinf0 = c4[0]; /* Remember vinf's lowest byte (will be overwritten).*/ 612 c4[0] = saved; /* Overwriting. Now v1 value is correct. */ 613 614 /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} 615 v0 v1 vinf[1..] 
vm1 v2 */ 616 617 mpn_toom_interpolate_5pts (c, v2, t, k, 2*r, sa, vinf0, trec); 618 619 #undef v2 620 } 621 622 void 623 mpn_toom3_sqr_n (mp_ptr c, mp_srcptr a, mp_size_t n, mp_ptr t) 624 { 625 mp_size_t k, k1, kk1, r, twok, twor; 626 mp_limb_t cy, saved, vinf0; 627 mp_ptr trec; 628 int sa; 629 mp_ptr c1, c2, c3, c4; 630 631 ASSERT(GMP_NUMB_BITS >= 6); 632 ASSERT(n >= 17); /* so that r <> 0 and 5k+3 <= 2n */ 633 634 /* the algorithm is the same as mpn_toom3_mul_n, with b=a */ 635 636 k = (n + 2) / 3; /* ceil(n/3) */ 637 twok = 2 * k; 638 k1 = k + 1; 639 kk1 = k + k1; 640 r = n - twok; /* last chunk */ 641 twor = 2 * r; 642 643 c1 = c + k; 644 c2 = c1 + k; 645 c3 = c2 + k; 646 c4 = c3 + k; 647 648 trec = t + 4 * k + 3; /* trec = v2 + (2k+2) */ 649 650 cy = mpn_add_n (c, a, a + twok, r); 651 if (r < k) 652 __GMPN_ADD_1 (cy, c + r, a + r, k - r, cy); 653 t[k] = (c1[0] = cy) + mpn_add_n (t, c, a + k, k); 654 655 #define v2 (t+2*k+1) 656 657 TOOM3_SQR_REC (c2, t, k1, trec); 658 659 sa = (cy != 0) ? 1 : mpn_cmp (c, a + k, k); 660 c[k] = (sa >= 0) ? cy - mpn_sub_n (c, c, a + k, k) 661 : mpn_sub_n (c, a + k, c, k); 662 663 TOOM3_SQR_REC (t, c, k1, trec); 664 665 #ifdef HAVE_NATIVE_mpn_addlsh1_n 666 c1[0] = mpn_addlsh1_n (c, a + k, a + twok, r); 667 if (r < k) 668 __GMPN_ADD_1 (c1[0], c + r, a + k + r, k - r, c1[0]); 669 c1[0] = 2 * c1[0] + mpn_addlsh1_n (c, a, c, k); 670 #else 671 c[r] = mpn_lshift (c, a + twok, r, 1); 672 if (r < k) 673 MPN_ZERO(c + r + 1, k - r); 674 c1[0] += mpn_add_n (c, c, a + k, k); 675 mpn_lshift (c, c, k1, 1); 676 c1[0] += mpn_add_n (c, c, a, k); 677 #endif 678 679 TOOM3_SQR_REC (v2, c, k1, trec); 680 681 TOOM3_SQR_REC (c, a, k, trec); 682 683 saved = c4[0]; 684 TOOM3_SQR_REC (c4, a + twok, r, trec); 685 vinf0 = c4[0]; 686 c4[0] = saved; 687 688 mpn_toom_interpolate_5pts (c, v2, t, k, 2*r, 1, vinf0, trec); 689 690 #undef v2 691 } 692 693 void 694 mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) 695 { 696 ASSERT (n >= 1); 697 ASSERT (! 
MPN_OVERLAP_P (p, 2 * n, a, n)); 698 ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n)); 699 700 if (BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) 701 { 702 mpn_mul_basecase (p, a, n, b, n); 703 } 704 else if (BELOW_THRESHOLD (n, MUL_TOOM3_THRESHOLD)) 705 { 706 /* Allocate workspace of fixed size on stack: fast! */ 707 mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)]; 708 ASSERT (MUL_TOOM3_THRESHOLD <= MUL_TOOM3_THRESHOLD_LIMIT); 709 mpn_kara_mul_n (p, a, b, n, ws); 710 } 711 else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) 712 { 713 mp_ptr ws; 714 TMP_SDECL; 715 TMP_SMARK; 716 ws = TMP_SALLOC_LIMBS (MPN_TOOM3_MUL_N_TSIZE (n)); 717 mpn_toom3_mul_n (p, a, b, n, ws); 718 TMP_SFREE; 719 } 720 #if WANT_FFT || TUNE_PROGRAM_BUILD 721 else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD)) 722 #else 723 else if (BELOW_THRESHOLD (n, MPN_TOOM44_MAX_N)) 724 #endif 725 { 726 mp_ptr ws; 727 TMP_SDECL; 728 TMP_SMARK; 729 ws = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (n, n)); 730 mpn_toom44_mul (p, a, n, b, n, ws); 731 TMP_SFREE; 732 } 733 else 734 #if WANT_FFT || TUNE_PROGRAM_BUILD 735 { 736 /* The current FFT code allocates its own space. That should probably 737 change. */ 738 mpn_mul_fft_full (p, a, n, b, n); 739 } 740 #else 741 { 742 /* Toom4 for large operands. */ 743 mp_ptr ws; 744 TMP_DECL; 745 TMP_MARK; 746 ws = TMP_BALLOC_LIMBS (mpn_toom44_mul_itch (n, n)); 747 mpn_toom44_mul (p, a, n, b, n, ws); 748 TMP_FREE; 749 } 750 #endif 751 } 752 753 void 754 mpn_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n) 755 { 756 ASSERT (n >= 1); 757 ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); 758 759 #if 0 760 /* FIXME: Can this be removed? 
*/ 761 if (n == 0) 762 return; 763 #endif 764 765 if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) 766 { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */ 767 mpn_mul_basecase (p, a, n, a, n); 768 } 769 else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) 770 { 771 mpn_sqr_basecase (p, a, n); 772 } 773 else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) 774 { 775 /* Allocate workspace of fixed size on stack: fast! */ 776 mp_limb_t ws[MPN_KARA_SQR_N_TSIZE (SQR_TOOM3_THRESHOLD_LIMIT-1)]; 777 ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT); 778 mpn_kara_sqr_n (p, a, n, ws); 779 } 780 else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) 781 { 782 mp_ptr ws; 783 TMP_SDECL; 784 TMP_SMARK; 785 ws = TMP_SALLOC_LIMBS (MPN_TOOM3_SQR_N_TSIZE (n)); 786 mpn_toom3_sqr_n (p, a, n, ws); 787 TMP_SFREE; 788 } 789 #if WANT_FFT || TUNE_PROGRAM_BUILD 790 else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD)) 791 #else 792 else if (BELOW_THRESHOLD (n, MPN_TOOM44_MAX_N)) 793 #endif 794 { 795 mp_ptr ws; 796 TMP_SDECL; 797 TMP_SMARK; 798 ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n)); 799 mpn_toom4_sqr (p, a, n, ws); 800 TMP_SFREE; 801 } 802 else 803 #if WANT_FFT || TUNE_PROGRAM_BUILD 804 { 805 /* The current FFT code allocates its own space. That should probably 806 change. */ 807 mpn_mul_fft_full (p, a, n, a, n); 808 } 809 #else 810 { 811 /* Toom4 for large operands. */ 812 mp_ptr ws; 813 TMP_DECL; 814 TMP_MARK; 815 ws = TMP_BALLOC_LIMBS (mpn_toom4_sqr_itch (n)); 816 mpn_toom4_sqr (p, a, n, ws); 817 TMP_FREE; 818 } 819 #endif 820 } 821