/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif


/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.
   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest power of
   2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
   using mp_bases[b].big_base for small b, and thereby get better value
   from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
   so would be more complicated than it's worth, and could well end up being
   a slowdown for small e.  For big e on the other hand the algorithm is
   dominated by mpn_sqr so there wouldn't be much of a saving.  The current
   code can be viewed as simply doing the first few steps of the powering in
   a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
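
/* Purely as an illustration of the square-and-multiply structure described
   above, and not part of the library: a word-sized version of the powering
   loop.  The bignum code below follows the same pattern, with blimb playing
   the role of "b" powered to b^(2^n) by repeated squaring and rl collecting
   the factors for the set bits of e.  (Hypothetical helper; assumes b^e
   fits in an unsigned long.)  */
#if 0
static unsigned long
example_pow_ul (unsigned long b, unsigned long e)
{
  unsigned long  r = 1;
  while (e != 0)
    {
      if (e & 1)
	r *= b;		/* multiply in b^(2^n) when bit n of e is set */
      e >>= 1;
      if (e != 0)
	b *= b;		/* b becomes b^(2^n) for the next bit */
    }
  return r;
}
#endif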


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.  When GMP_NUMB_BITS is odd this is a slight
   underestimate, but it's still a safe bound for squaring.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
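
/* For instance, with a 32-bit limb and no nail bits GMP_NUMB_HALFMAX is
   0xFFFF, and 0xFFFF * 0xFFFF = 0xFFFE0001 still fits in one limb.  */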


/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
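
/* For illustration of the size bookkeeping: MPN_MUL_1 on a 2-limb {ptr,2}
   leaves size at 2 or 3 depending on whether mpn_mul_1 returned a carry
   limb, and MPN_SQR on a 3-limb operand writes 6 limbs then drops size
   back to 5 if the top limb came out zero.  The ASSERTs only check that
   the destination was allocated big enough.  */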


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
		 PTR(r), bp, bsize, e, e);
	 mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
		 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
	 but if e is small then we might reach e==0 and the whole b^e in rl.
	 Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
	{
	  TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
			 e, blimb, rl));
	  ASSERT (e != 0);
	  if ((e & 1) != 0)
	    rl *= blimb;
	  e >>= 1;
	  if (e == 0)
	    goto got_rl;
	  blimb *= blimb;
	}
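
      /* For illustration, b=3 e=5 runs: e odd so rl=3, e=2, blimb=9;
	 e even, e=1, blimb=81; e odd so rl=243, e=0, done -- ie. 3^5
	 accumulated entirely in rl from the b^(2^n) factors.  */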

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
	{
	  umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
	  rl >>= GMP_NAIL_BITS;
	}
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
		     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
	 final mul_1 or mul_2 rather than a separate lshift.
	 - rl_high:rl mustn't be 1 (since then there's no final mul)
	 - rl_high mustn't overflow
	 - rl_high mustn't change to non-zero, since mul_1+lshift is
	 probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
	  && ! (rl_high == 0 && rl == 1)
	  && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
	    | (rl >> (GMP_NUMB_BITS-rtwos_bits));
	  if (! (rl_high == 0 && new_rl_high != 0))
	    {
	      rl_high = new_rl_high;
	      rl <<= rtwos_bits;
	      rtwos_bits = 0;
	      TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
			     rl_high, rl));
	    }
	}
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
	 mul_1 rather than a separate lshift.
	 - rl mustn't be 1 (since then there's no final mul)
	 - rl mustn't overflow	*/

      if (rtwos_bits != 0
	  && rl != 1
	  && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  rl <<= rtwos_bits;
	  rtwos_bits = 0;
	  TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
	}
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
	blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
	{
	  /* Two limbs became one after rshift. */
	  bsize = 1;
	  goto bsize_1;
	}

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
	{
	  mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
	  MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
	  bp = tp;
	  TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
	}
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
	     mpn_trace ("b", bp, bsize));
    }
  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*GMP_NUMB_BITS - cnt) bits and there's e
     of them; +1 limb to round up the division; +1 for multiplies all using
     an extra limb over the true size; +2 for rl at the end; +1 for lshift
     at the end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
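
  /* For instance, with 64-bit limbs and no nails, a 3-limb base with 10
     leading zero bits in its top limb (about 182 significant bits) and e=7
     gives ralloc = (192-10)*7/64 + 5 = 24 limbs, while the true result
     needs at most ceil(182*7/64) = 20 limbs.  */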

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
		 ralloc, bsize, blimb, cnt));
  rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
	 start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
	 low bit of e is zero, tp only has to hold the second last power
	 step, which is half the size of the final result.  There's no need
	 to round up the divide by 2, since ralloc includes a +2 for rl
	 which is not needed by tp.  In the mpn_mul loop when the low bit
	 of e is 1, tp must hold nearly the full result, so just size it
	 the same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
	talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
	talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
	 the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
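
      /* For example e=13 (binary 1101) on a 64-bit limb gives cnt=60 and
	 i=2: the loops below square for bits 2,1,0 and multiply in the
	 base for the set bits 2 and 0, having started from the implicit
	 top bit.  */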

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
	{
	  mp_limb_t  mult[2];

	  /* Any bsize==1 will have been powered above to be two limbs. */
	  ASSERT (bsize == 2);
	  ASSERT (blimb != 0);

	  /* Arrange the final result ends up in r, not in the temp space */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb_low;
	  rp[1] = blimb;
	  rsize = 2;

	  mult[0] = blimb_low;
	  mult[1] = blimb;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }

	  TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
	  if (rl_high != 0)
	    {
	      mult[0] = rl;
	      mult[1] = rl_high;
	      MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }
	  else if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#else
      if (bsize == 1)
	{
	  /* Arrange the final result ends up in r, not in the temp space */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb;
	  rsize = 1;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_1 (rp, rsize, ralloc, blimb);
	    }

	  TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
	  if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#endif
      else
	{
	  int  parity;

	  /* Arrange the final result ends up in r, not in the temp space */
	  ULONG_PARITY (parity, e);
	  if (((parity ^ i) & 1) != 0)
	    SWAP_RP_TP;

	  MPN_COPY (rp, bp, bsize);
	  rsize = bsize;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		{
		  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
		  SWAP_RP_TP;
		}
	    }
	}
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}