1 /* mpz_n_pow_ui -- mpn raised to ulong.
2
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
6
7 Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.
8
9 This file is part of the GNU MP Library.
10
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of either:
13
14 * the GNU Lesser General Public License as published by the Free
15 Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
17
18 or
19
20 * the GNU General Public License as published by the Free Software
21 Foundation; either version 2 of the License, or (at your option) any
22 later version.
23
24 or both in parallel, as here.
25
26 The GNU MP Library is distributed in the hope that it will be useful, but
27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 for more details.
30
31 You should have received copies of the GNU General Public License and the
32 GNU Lesser General Public License along with the GNU MP Library. If not,
33 see https://www.gnu.org/licenses/. */
34
35 #include "gmp.h"
36 #include "gmp-impl.h"
37 #include "longlong.h"
38
39
/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine, by substituting the reference implementation and pretending it
   is native. */
#if 0
#define mpn_mul_2 refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2 1
#endif
50
51
52 /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
53 ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
54 bsize==2 or >2, but separating that isn't easy because there's shared
55 code both before and after (the size calculations and the powers of 2
56 handling).
57
58 Alternatives:
59
60 It would work to just use the mpn_mul powering loop for 1 and 2 limb
61 bases, but the current separate loop allows mul_1 and mul_2 to be done
62 in-place, which might help cache locality a bit. If mpn_mul was relaxed
63 to allow source==dest when vn==1 or 2 then some pointer twiddling might
64 let us get the same effect in one loop.
65
66 The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
67 form the biggest possible power of b that fits, only the biggest power of
68 2 power, ie. b^(2^n). It'd be possible to choose a bigger power, perhaps
69 using mp_bases[b].big_base for small b, and thereby get better value
70 from mpn_mul_1 or mpn_mul_2 in the bignum powering. It's felt that doing
71 so would be more complicated than it's worth, and could well end up being
72 a slowdown for small e. For big e on the other hand the algorithm is
   dominated by mpn_sqr so there wouldn't be much of a saving.  The current
74 code can be viewed as simply doing the first few steps of the powering in
75 a single or double limb where possible.
76
77 If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
78 copy made of b is unnecessary. We could just use the old alloc'ed block
79 and free it at the end. But arranging this seems like a lot more trouble
80 than it's worth. */
81
82
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.  Used below to decide how long the bsize==1
   powering can keep squaring within a single limb.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
88
89
90 /* The following are for convenience, they update the size and check the
91 alloc. */
92
/* Square {src,size} into {dst,..} and update size to the product size,
   which is 2*size, less one when the high limb of the product is zero
   (keeping {dst,size} normalized).  alloc is only used to assert there
   is room for the full 2*size limbs mpn_sqr writes.  */
#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
100
/* Multiply {src,size} by {src2,size2} into dst and set size to the
   product size.  mpn_mul returns the high limb of the product; when that
   is zero the product occupies only size+size2-1 limbs, hence the
   (cy == 0) adjustment.  alloc is only used for the room assertion.  */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t cy;                                       \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)
108
/* Multiply {ptr,size} in place by the 2-limb multiplier {mult,2},
   updating size.  mpn_mul_2 writes size+1 limbs at ptr and returns the
   limb above them; that carry limb is stored at the top and counted in
   size only when non-zero, so {ptr,size} stays normalized.  The result
   is size+1 or size+2 limbs; alloc is used only for the room assertion. */
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t cy;                               \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
118
/* Multiply {ptr,size} in place by the single limb "limb", updating size.
   mpn_mul_1 returns the carry limb; it is stored at the top and counted
   in size only when non-zero, keeping {ptr,size} normalized.  alloc is
   used only for the room assertion.
   (The index argument is parenthesized, "(ptr)[(size)]", for macro
   hygiene and consistency with MPN_MUL_2 above.)  */
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t cy;                               \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
127
/* Left shift {ptr,size} in place by "shift" bits (0 < shift <
   GMP_NUMB_BITS), updating size.  mpn_lshift returns the bits shifted
   out of the top limb; they are stored above and counted in size only
   when non-zero, keeping {ptr,size} normalized.  alloc is used only for
   the room assertion.
   (The index argument is parenthesized, "(ptr)[(size)]", for macro
   hygiene and consistency with MPN_MUL_2 above.)  */
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t cy;                               \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
136
/* Copy {src,size} to dst, shifted right by "shift" bits; shift==0 means
   a plain copy.  A non-zero shift can leave the high limb zero, in which
   case size is decremented so {dst,size} stays normalized.  (src must be
   odd after the shift for this to drop at most one limb, which holds for
   the stripped base used below.)  */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
147
148
149 /* ralloc and talloc are only wanted for ASSERTs, after the initial space
150 allocations. Avoid writing values to them in a normal build, to ensure
151 the compiler lets them go dead. gcc already figures this out itself
152 actually. */
153
/* Exchange the roles of the result area rp and the temporary area tp
   (and, in an assertion build only, their recorded allocations).  */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
159
160
/* Set r to b^e, where the base b is the mpn {bp,bsize}; bsize may be
   negative to indicate a negative base.  b^0 is 1 (including 0^0), and
   0^e for e>0 is 0.  r may overlap {bp,bsize}.

   Strategy: low zero limbs and bits of b are stripped first (each
   contributes e zero limbs/bits to the result, reinstated at the end),
   then b^e is formed by left-to-right binary powering — square per bit
   of e, multiply by the base on 1-bits — using mpn_mul_1/mpn_mul_2
   in-place loops for small bases and the general mpn_mul otherwise.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result: negative only for a negative base and an
     odd exponent.  */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  /* Remember whether r and b share storage; if so the bsize>2 case below
     must copy b aside before writing to r.  */
  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each zero low limb of b contributes e
     zero low limbs to the result, appended at the end.  */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b.  Likewise each zero low bit contributes
     e zero bits to the result, split into whole limbs (rtwos_limbs) and
     a sub-limb left shift (rtwos_bits) applied at the end.  */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  /* rl (rl_high:rl with native mul_2) accumulates the factors of the
     final result picked up while powering within one limb; it's
     multiplied in at the end.  */
  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          /* Right-to-left binary powering: pick up blimb on a 1-bit,
             then square blimb while it still fits in half a limb.  */
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          /* Nails: shift up before umul_ppmm and strip them from the low
             half of the double-limb product afterwards.  */
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      /* Apply the btwos right shift across the two limbs of b.  */
      mp_limb_t bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift; jump into the single-limb
             powering above.  */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      /* Without native mul_2 the general mpn_mul loop below is used, so
         park the shifted two-limb base in a local temporary.  */
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      /* Big base: copy (and right shift) b aside when it overlaps r or
         still carries stripped low zero bits.  */
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2.  */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start; here the whole b^e already sits in rl (rl_high:rl).  */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr tp;
      mp_size_t talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space.
             Only the squarings swap buffers (the mul_2 is in-place), so
             the parity of the remaining bit count decides.  */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          /* Left-to-right powering: square each step, multiply in the
             base on 1-bits of e.  */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          /* Multiply in the factor accumulated in rl_high:rl, if any.  */
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space.
             Only the squarings swap buffers (the mul_1 is in-place), so
             the parity of the remaining bit count decides.  */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          /* Multiply in the factor accumulated in rl, if any.  */
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int parity;

          /* Arrange the final result ends up in r, not in the temp space.
             Here both the squaring and each multiply swap buffers: one
             swap per bit of e below the top, plus one per 1-bit of e,
             hence the parity^i test.  */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  /* The buffer-swap parity arrangements above must have left the result
     in r's own block.  */
  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2.  */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  /* Count the stripped low zero limbs back in and set the signed size.  */
  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
534