/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012, 2015, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include <stdlib.h>
#include <stdio.h>
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif


/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest 2^n
   power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
   using mp_bases[b].big_base for small b, and thereby get better value
   from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
   so would be more complicated than it's worth, and could well end up being
   a slowdown for small e.  For big e on the other hand the algorithm is
   dominated by mpn_sqr so there wouldn't be much of a saving.  The current
   code can be viewed as simply doing the first few steps of the powering in
   a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
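

/* For orientation: every powering path below is binary exponentiation.
   A minimal single-limb sketch of the right-to-left variant used for the
   bsize==1 case (ref_pow_limb is a hypothetical helper, not part of GMP,
   and overflow is ignored; the real code switches to mpn arithmetic once
   blimb exceeds GMP_NUMB_HALFMAX):  */
#if 0
static mp_limb_t
ref_pow_limb (mp_limb_t b, unsigned long e)
{
  mp_limb_t  r = 1;
  while (e != 0)
    {
      if ((e & 1) != 0)
        r *= b;         /* multiply in b^(2^k) when bit k of e is set */
      e >>= 1;
      if (e != 0)
        b *= b;         /* b becomes b^(2^(k+1)) */
    }
  return r;
}
#endif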


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate (though still a safe one) when
   GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
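
/* Example: with GMP_NUMB_BITS == 32 this is 0xFFFF, and
   0xFFFF * 0xFFFF = 0xFFFE0001 still fits in one limb.  */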


/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)
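
/* Note for the two macros above: a product of size (or size+size2) source
   limbs occupies that many limbs, or one fewer when the top limb comes out
   zero; mpn_mul returns that top limb and MPN_SQR tests it directly, hence
   the size adjustments.  */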

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  mp_limb_t      ovfl;
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      MPZ_NEWALLOC (r, 1)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  umul_ppmm (ovfl, rtwos_bits, e, btwos);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }

  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));
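
  /* Example: b = 48 = 3 * 2^4 with e = 5 gives btwos = 4, so the result
     3^5 * 2^20 needs e * btwos = 20 low zero bits: rtwos_limbs full zero
     limbs plus an rtwos_bits partial-limb lshift at the very end.  */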

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }
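
      /* Example for b = 3, e = 5 (binary 101): bit 0 set -> rl = 3, then
         blimb = 9; bit 1 clear -> blimb = 81; bit 2 set -> rl = 3 * 81
         = 243 and e reaches 0, giving 3^5 = 243 entirely in rl.  */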

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
         probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
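
  /* E.g. with GMP_NUMB_BITS == 32, b = 0x10001 (17 bits) and e = 100:
     ralloc = 17*100/32 + 5 = 58 limbs, versus a true result size of
     about 1601 bits = 51 limbs.  */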

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);

  umul_ppmm (ovfl, ralloc, (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS), e);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }
  ralloc = ralloc / GMP_NUMB_BITS + 5;

  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
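
      /* E.g. e = 13 = 0b1101 has its top 1 at bit 3, so i starts at 2;
         the loops below then compute b^13 as ((b^2 * b)^2)^2 * b,
         squaring once per remaining bit and multiplying where set.  */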

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;
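
          /* (The loop below swaps rp and tp once per iteration, i+1 times
             in all, since MPN_MUL_2 works in place; swapping first when
             i+1 is odd makes the total even, leaving the result in rp.)  */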

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;
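
          /* (The loop below swaps once after each MPN_SQR and once more
             after each MPN_MUL, i + popcount(e) swaps in all; swapping
             first when that total is odd leaves the result in rp.)  */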

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
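
/* Usage sketch: the public wrappers are expected to call this internal
   routine along the lines of (see mpz/pow_ui.c; shown for illustration,
   not a definitive transcription):

     mpz_n_pow_ui (r, PTR(b), (mp_size_t) SIZ(b), e);

   with bsize carrying the sign of the base and bp its limbs.  */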