/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of
   that routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif


/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul were relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   "power of 2" power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */


/* The biggest value that can be squared in a limb without overflowing.
   When GMP_NUMB_BITS is odd this is a slight underestimate of
   floor(sqrt(GMP_NUMB_MAX)), which is harmless since it remains a safe
   bound.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
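
/* Illustration only, not compiled: a minimal sketch of the binary powering
   ("square and multiply") scheme used throughout this file, shown for a
   single limb with no overflow checks.  The function name is hypothetical.
   The real code below applies the same idea, but moves from one limb to a
   limb pair and then to full mpn products as the values grow.  */
#if 0
static mp_limb_t
example_pow_limb (mp_limb_t b, unsigned long e)
{
  mp_limb_t  r = 1;
  while (e != 0)
    {
      if ((e & 1) != 0)
        r *= b;         /* fold in this bit of the exponent */
      e >>= 1;
      if (e == 0)
        break;
      b *= b;           /* b^(2^n), ready for the next bit */
    }
  return r;
}
#endif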
/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
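
/* Illustration only, not compiled: the intended pattern for the wrappers
   above, as used by the powering loops below.  rp, tp, rsize, ralloc,
   talloc and `limb' stand for hypothetical locals.  MPN_SQR leaves the
   square in tp and doubles the size, so the pointers are swapped before
   multiplying back into what is now rp.  */
#if 0
  MPN_SQR (tp, talloc, rp, rsize);      /* {tp,2*rsize} = {rp,rsize}^2  */
  SWAP_RP_TP;                           /* the square becomes the new rp */
  MPN_MUL_1 (rp, rsize, ralloc, limb);  /* rp *= limb, extend by carry   */
#endif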
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr       rp;
  mp_size_t    rtwos_limbs, ralloc, rsize;
  int          rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t    blimb, rl;
  mp_bitcnt_t  rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t    blimb_low, rl_high;
#else
  mp_limb_t    b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--;  ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
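
  /* Worked example (illustrative numbers, not from the source): with
     GMP_NUMB_BITS == 64, a base ending in btwos == 5 zero bits and e == 13,
     the result carries e*btwos == 65 low zero bits, stored as
     rtwos_limbs == 1 whole zero limb plus rtwos_bits == 1 bit applied by
     the final lshift.  */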
  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with
         e!=0, but if e is small then we might reach e==0 with the whole
         b^e in rl.  Notice this code works when blimb==1 too, reaching
         e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX\n", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at
     least half a limb, so in 32 bits the worst case is 2^16+1 treated as
     17 bits when it will power up as just over 16, an overestimate of
     17/16 = 6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with
     ralloc==5, and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
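
  /* Worked example (illustrative numbers, not from the source): with
     32-bit limbs, each factor of b == 2^16+1 is charged 17 bits, so
     e == 100 factors are charged 1700 bits, ie. 53 limbs plus the 5 slack
     limbs, while the true b^e is just over 1600 bits, ie. 51 limbs.  The
     few extra limbs are the price of using only the top limb's bit
     count.  */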
  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops, or in the mpn_mul loop when
         the low bit of e is zero, tp only has to hold the second last
         power step, which is half the size of the final result.  There's
         no need to round up the divide by 2, since ralloc includes a +2
         for rl which is not needed by tp.  In the mpn_mul loop when the
         low bit of e is 1, tp must hold nearly the full result, so just
         size it the same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing
         at the bit below the highest 1 (which will mean i==-1 if e==1). */
      count_leading_zeros (cnt, e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
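
/* Usage sketch (illustration only, not compiled): mpz_n_pow_ui is the
   common engine behind mpz_pow_ui and mpz_ui_pow_ui, taking the base as a
   raw limb pointer with a signed size.  The example function below is
   hypothetical and assumes the gmp-impl.h accessors PTR and SIZ.  */
#if 0
static void
example_n_pow_ui (void)
{
  mpz_t  b, r;
  mpz_init_set_ui (b, 10L);
  mpz_init (r);
  mpz_n_pow_ui (r, PTR(b), (mp_size_t) SIZ(b), 500L);   /* r = 10^500 */
  mpz_clear (b);
  mpz_clear (r);
}
#endif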