/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of
   that routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif
/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more
   trouble than it's worth.  */
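

/* For orientation, a minimal standalone sketch of the high-to-low binary
   powering that the loops further down implement, done here on a plain
   unsigned long with overflow ignored.  Disabled like the mul_2 test hook
   above; ref_pow_ui is an illustrative name, not a GMP routine.  */
#if 0
static unsigned long
ref_pow_ui (unsigned long b, unsigned long e)
{
  unsigned long  r;
  int  i;

  ASSERT (e != 0);

  /* Set i to the bit below the highest 1 bit of e. */
  for (i = 8 * sizeof (e) - 1; (e >> i) == 0; i--)
    ;
  i--;

  r = b;   /* the highest 1 bit of e */
  for ( ; i >= 0; i--)
    {
      r *= r;          /* square for every remaining bit of e */
      if ((e >> i) & 1)
        r *= b;        /* and multiply in b where the bit is 1 */
    }
  return r;
}
#endif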


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd, but
   a safe one: it only makes the single-limb powering loop below stop
   slightly early.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
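
/* For instance with a 32-bit limb and no nail, GMP_NUMB_HALFMAX is 0xFFFF,
   and 0xFFFF * 0xFFFF == 0xFFFE0001 still fits in the limb.  */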


/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--;  ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with
         e!=0, but if e is small then we might reach e==0 and the whole
         b^e in rl.  Notice this code works when blimb==1 too, reaching
         e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }
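
      /* Informal note: throughout the loop above, rl * blimb^e stays equal
         to the stripped base raised to the original exponent, whether the
         loop exits via e==0 (goto got_rl) or via blimb outgrowing
         GMP_NUMB_HALFMAX.  */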

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */
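
      /* Eg. for b==6 and e==3 the stripped base gives blimb==3, so the
         powering loop leaves rl==27 with rtwos_bits==3, and the merge
         below turns rl into 27<<3 == 216 == 6^3, saving the final
         lshift.  */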

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*GMP_NUMB_BITS - cnt) bits and there's e
     of them; +1 limb to round up the division; +1 for multiplies all using
     an extra limb over the true size; +2 for rl at the end; +1 for lshift
     at the end.

     The size calculation here is reasonably accurate.  The base is at
     least half a limb, so in 32 bits the worst case is 2^16+1 treated as
     17 bits when it will power up as just over 16, an overestimate of
     17/16 = 6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with
     ralloc==5, and that's fine for 2 limbs of rl and 1 of lshift.  */
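
  /* A worked instance of the formula below: with 32-bit limbs, no nail,
     bsize==1, blimb==0x10001 (so cnt==15) and e==10, ralloc comes out as
     (32-15)*10/32 + 5 == 10 limbs, comfortably above the true size of
     0x10001^10, which is just over 160 bits, ie. 6 limbs.  */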

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr tp;
      mp_size_t talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when
         the low bit of e is zero, tp only has to hold the second last
         power step, which is half the size of the final result.  There's
         no need to round up the divide by 2, since ralloc includes a +2
         for rl which is not needed by tp.  In the mpn_mul loop when the
         low bit of e is 1, tp must hold nearly the full result, so just
         size it the same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing
         at the bit below the highest 1 (which will mean i==-1 if e==1). */
      count_leading_zeros (cnt, e);
      i = GMP_LIMB_BITS - cnt - 2;
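
      /* Eg. e==13 (binary 1101) gives i==2: the loops below square three
         times and multiply in the base after the first and third squarings
         (bits 2 and 0), building b, b^3, b^6, b^13.  */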

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}