/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012, 2015, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include <stdlib.h>
#include <stdio.h>
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine. */
#if 0
#define mpn_mul_2 refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2 1
#endif

/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest power
   with a power-of-2 exponent, ie. b^(2^n).  It'd be possible to choose a
   bigger power, perhaps using mp_bases[b].big_base for small b, and thereby
   get better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.
   It's felt that doing so would be more complicated than it's worth, and
   could well end up being a slowdown for small e.  For big e on the other
   hand the algorithm is dominated by mpn_sqr so there wouldn't be much of
   a saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth. */
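

/* The main powering loops below follow the classic left-to-right binary
   method sketched here.  This reference routine is only an illustration
   (ref_pow_limb is not a GMP function and assumes b^e fits in one limb),
   kept under #if 0 like the mul_2 test hook above. */
#if 0
static mp_limb_t
ref_pow_limb (mp_limb_t b, unsigned long e)
{
  mp_limb_t r = b;
  int i;
  ASSERT (e != 0);
  for (i = 0; (e >> i) > 1; i++)   /* i becomes the index of the top 1 bit */
    ;
  for (i--; i >= 0; i--)           /* then walk the bits below it */
    {
      r *= r;                      /* square for every remaining bit of e */
      if ((e >> i) & 1)
        r *= b;                    /* multiply in b when the bit is set */
    }
  return r;
}
#endif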


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
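
/* For example (illustration only): with a nail-less 64-bit limb
   GMP_NUMB_HALFMAX is 2^32-1, and (2^32-1)^2 = 2^64 - 2^33 + 1 still fits
   in the limb.  With an odd GMP_NUMB_BITS of say 31 the macro gives
   2^15-1 = 32767, although floor(sqrt(2^31-1)) = 46340, which is the
   underestimate the FIXME above refers to. */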


/* The following are for convenience, they update the size and check the
   alloc. */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t cy;                                       \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t cy;                               \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t cy;                               \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t cy;                               \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
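
/* Usage note (illustrative only): each of the above updates its size
   argument in place.  E.g. MPN_MUL_1 (rp, rsize, ralloc, 10) on a 2-limb rp
   stores the carry-out at rp[2] and leaves rsize at 2 or bumps it to 3
   according to whether that carry was zero, while the alloc argument is
   consulted only by the ASSERTs. */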


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually. */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  mp_limb_t ovfl;
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      MPZ_NEWALLOC (r, 1)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  umul_ppmm (ovfl, rtwos_bits, e, btwos);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }

  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
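
  /* Worked example (illustration only): for b = 12 = 3*2^2 and e = 5, two
     low zero bits are stripped (btwos = 2) leaving blimb = 3, and the final
     result 3^5 * 2^10 = 248832 gets its factor of 2^10 back as rtwos_limbs
     whole limbs plus rtwos_bits leftover bits. */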

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0. */
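
      /* For instance (illustration only), blimb = 3 and e = 10: the loop
         below squares blimb through 3, 9, 81, 6561 and folds the odd-bit
         factors into rl, ending with rl = 9*6561 = 59049 = 3^10 and e
         reduced to 0, all within a single limb. */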

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?) */
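
      /* Example (illustration only): rl_high:rl = 0:3 with rtwos_bits = 4
         becomes 0:48, so that factor of 2^4 rides along in the final mul_1
         instead of costing a separate mpn_lshift at the end. */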

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift. */
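
  /* Worked instance of that estimate (illustration only): with 32-bit
     nail-less limbs, b = 2^16+1 and e = 1000 charge 17*1000 = 17000 bits,
     531 limbs before the +5, while the true result is just over 16000 bits,
     about 500 limbs -- the 17/16 overestimate mentioned above. */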

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);

  umul_ppmm (ovfl, ralloc, (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS), e);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }
  ralloc = ralloc / GMP_NUMB_BITS + 5;

  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr tp;
      mp_size_t talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp. */
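
      /* Sketch of why the halving is safe (illustration only): the buffers
         are pointer-swapped after every square, and the parity swaps below
         arrange that the last, near full-size step lands in r's block; in
         the halved cases the block allocated here as tp then never holds
         more than the second last power step, about half the final size. */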

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1). */
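
      /* For example (illustration only): e = 13 = 0b1101 has its highest 1
         at bit 3, so i starts at 2 and the bits 1,0,1 below it drive the
         loops: square, multiply, square, square, multiply, taking b to b^13.
         For e==1 the loops are skipped entirely. */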
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
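          /* (Swap-count note, illustrative: each pass of the loop below
             swaps buffers exactly once, after the square, so with i+1
             passes the pointers end up exchanged exactly when i is even;
             pre-swapping here makes the last square land in r's block.) */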
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;
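          /* (Swap-count note, illustrative: here a set bit of e costs an
             extra swap since MPN_MUL can't work in place, so the loop below
             swaps i+1 times for the squares plus once per set bit below the
             top bit of e, i.e. i + popcount(e) times in total; the parity^i
             test above pre-swaps exactly when that total is odd.) */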

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}