xref: /dragonfly/contrib/gmp/mpz/powm.c (revision 36a3d1d6)
/* mpz_powm(res,base,exp,mod) -- Set RES to (base**exp) mod MOD.

   Contributed by Paul Zimmermann.

Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2005, 2009
Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
#ifdef BERKELEY_MP
#include "mp.h"
#endif

/* Compute t = a mod m, a is defined by (ap,an), m is defined by (mp,mn), and
   t is defined by (tp,mn).  */
static void
reduce (mp_ptr tp, mp_srcptr ap, mp_size_t an, mp_srcptr mp, mp_size_t mn)
{
  mp_ptr qp;
  TMP_DECL;

  TMP_MARK;
  qp = TMP_ALLOC_LIMBS (an - mn + 1);

  mpn_tdiv_qr (qp, tp, 0L, ap, an, mp, mn);

  TMP_FREE;
}

#if REDUCE_EXPONENT
/* Return phi(t), the order of the multiplicative group mod t (Euler's
   totient), computed by trial division.  */
static mp_limb_t
phi (mp_limb_t t)
{
  mp_limb_t d, m, go;

  go = 1;

  if (t % 2 == 0)
    {
      t = t / 2;
      while (t % 2 == 0)
	{
	  go *= 2;
	  t = t / 2;
	}
    }
  for (d = 3;; d += 2)
    {
      m = d - 1;
      for (;;)
	{
	  unsigned long int q = t / d;
	  if (q < d)
	    {
	      if (t <= 1)
		return go;
	      if (t == d)
		return go * m;
	      return go * (t - 1);
	    }
	  if (t != q * d)
	    break;
	  go *= m;
	  m = d;
	  t = q;
	}
    }
}
#endif
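
/* Worked example for phi: 12 = 2^2 * 3, so phi(12) = 2 * (3 - 1) = 4.
   Tracing the code: t = 12 is even, so t becomes 6; the while loop strips
   the second factor of 2, leaving go = 2 and t = 3; then d = 3 gives
   q = 1 < d with t == d, so the result is go * m = 2 * 2 = 4.  */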

/* Average number of calls to redc for an exponent of n bits with the
   sliding window algorithm of base 2^k: the optimum is obtained for the
   value of k that minimizes 2^(k-1)+n/(k+1):

   n\k    4     5     6     7     8
   128    156*  159   171   200   261
   256    309   307*  316   343   403
   512    617   607*  610   632   688
   1024   1231  1204  1195* 1207  1256
   2048   2461  2399  2366  2360* 2396
   4096   4918  4787  4707  4665* 4670
*/
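
/* For example, with n = 512 the expression 2^(k-1) + n/(k+1) is about 110
   for k = 4, 101 for k = 5 and 105 for k = 6, so k = 5 wins, matching the
   starred entry (607) in the n = 512 row above.  (The roughly n squarings,
   which do not depend on k, are not counted in this expression.)  */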


/* Use REDC instead of usual reduction for sizes < POWM_THRESHOLD.  In REDC
   each modular multiplication costs about 2*n^2 limb operations, whereas
   using usual reduction it costs 3*K(n), where K(n) is the cost of a
   multiplication using Karatsuba, and a division is assumed to cost 2*K(n),
   for example using Burnikel-Ziegler's algorithm.  This gives a theoretical
   threshold of a*SQR_KARATSUBA_THRESHOLD, with a=(3/2)^(1/(2-ln(3)/ln(2))) ~
   2.66.  */
/* For now, also disable REDC when MOD is even, as the inverse can't handle
   that.  At some point, we might want to make the code faster for that case,
   perhaps using CRR.  */

#ifndef POWM_THRESHOLD
#define POWM_THRESHOLD  ((8 * SQR_KARATSUBA_THRESHOLD) / 3)
#endif
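
/* The default above uses the factor 8/3 ~ 2.67, which agrees with the
   theoretical constant a ~ 2.66 derived in the comment above to within
   about half a percent.  */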

#define HANDLE_NEGATIVE_EXPONENT 1
#undef REDUCE_EXPONENT

void
#ifndef BERKELEY_MP
mpz_powm (mpz_ptr r, mpz_srcptr b, mpz_srcptr e, mpz_srcptr m)
#else /* BERKELEY_MP */
pow (mpz_srcptr b, mpz_srcptr e, mpz_srcptr m, mpz_ptr r)
#endif /* BERKELEY_MP */
{
  mp_ptr xp, tp, qp, gp, this_gp;
  mp_srcptr bp, ep, mp;
  mp_size_t bn, es, en, mn, xn;
  mp_limb_t invm, c;
  unsigned long int enb;
  mp_size_t i, K, j, l, k;
  int m_zero_cnt, e_zero_cnt;
  int sh;
  int use_redc;
#if HANDLE_NEGATIVE_EXPONENT
  mpz_t new_b;
#endif
#if REDUCE_EXPONENT
  mpz_t new_e;
#endif
  TMP_DECL;

  mp = PTR(m);
  mn = ABSIZ (m);
  if (mn == 0)
    DIVIDE_BY_ZERO;

  TMP_MARK;

  es = SIZ (e);
  if (es <= 0)
    {
      if (es == 0)
	{
	  /* Exponent is zero, result is 1 mod m, i.e. 1, or 0 when m
	     equals 1.  */
	  SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1;
	  PTR(r)[0] = 1;
	  TMP_FREE;	/* we haven't really allocated anything here */
	  return;
	}
#if HANDLE_NEGATIVE_EXPONENT
      MPZ_TMP_INIT (new_b, mn + 1);

      if (! mpz_invert (new_b, b, m))
	DIVIDE_BY_ZERO;
      b = new_b;
      es = -es;
#else
      DIVIDE_BY_ZERO;
#endif
    }
  en = es;

#if REDUCE_EXPONENT
  /* Reduce the exponent mod phi(m) when m is small.  */
  if (mn == 1 && mp[0] < 0x7fffffffL && en * GMP_NUMB_BITS > 150)
    {
      MPZ_TMP_INIT (new_e, 2);
      mpz_mod_ui (new_e, e, phi (mp[0]));
      e = new_e;
    }
#endif

  use_redc = mn < POWM_THRESHOLD && mp[0] % 2 != 0;
  if (use_redc)
    {
      /* invm = -1/m mod 2^BITS_PER_MP_LIMB, must have m odd */
      modlimb_invert (invm, mp[0]);
      invm = -invm;
    }
  else
    {
      /* Normalize m (i.e. make its most significant bit set) as required by
	 division functions below.  */
      count_leading_zeros (m_zero_cnt, mp[mn - 1]);
      m_zero_cnt -= GMP_NAIL_BITS;
      if (m_zero_cnt != 0)
	{
	  mp_ptr new_mp;
	  new_mp = TMP_ALLOC_LIMBS (mn);
	  mpn_lshift (new_mp, mp, mn, m_zero_cnt);
	  mp = new_mp;
	}
    }

  /* Determine optimal value of k, the number of exponent bits we look at
     at a time.  */
  count_leading_zeros (e_zero_cnt, PTR(e)[en - 1]);
  e_zero_cnt -= GMP_NAIL_BITS;
  enb = en * GMP_NUMB_BITS - e_zero_cnt; /* number of bits of exponent */
  k = 1;
  K = 2;
  while (2 * enb > K * (2 + k * (3 + k)))
    {
      k++;
      K *= 2;
      if (k == 10)			/* cap allocation */
	break;
    }
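
  /* A sketch of where the loop condition comes from, using the cost
     expression 2^(k-1) + enb/(k+1) from the table comment above: going
     from k to k+1 pays off as long as 2^(k-1) < enb/((k+1)*(k+2)),
     i.e. as long as 2 * enb > 2^k * (k+1) * (k+2) = K * (2 + k*(3+k)).  */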

  tp = TMP_ALLOC_LIMBS (2 * mn);
  qp = TMP_ALLOC_LIMBS (mn + 1);

  gp = __GMP_ALLOCATE_FUNC_LIMBS (K / 2 * mn);

  /* Compute x*R^n where R=2^BITS_PER_MP_LIMB.  */
  bn = ABSIZ (b);
  bp = PTR(b);
  /* Handle |b| >= m by computing b mod m.  FIXME: It is not strictly necessary
     for speed or correctness to do this when b and m have the same number of
     limbs, perhaps remove mpn_cmp call.  */
  if (bn > mn || (bn == mn && mpn_cmp (bp, mp, mn) >= 0))
    {
      /* Reduce possibly huge base while moving it to gp[0].  Use a function
	 call to reduce, since we don't want the quotient allocation to
	 live until function return.  */
      if (use_redc)
	{
	  reduce (tp + mn, bp, bn, mp, mn);	/* b mod m */
	  MPN_ZERO (tp, mn);
	  mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn); /* unnormalized! */
	}
      else
	{
	  reduce (gp, bp, bn, mp, mn);
	}
    }
  else
    {
      /* |b| < m.  We pad out operands to become mn limbs, which simplifies
	 the rest of the function, but slows things down when |b| << m.  */
      if (use_redc)
	{
	  MPN_ZERO (tp, mn);
	  MPN_COPY (tp + mn, bp, bn);
	  MPN_ZERO (tp + mn + bn, mn - bn);
	  mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn);
	}
      else
	{
	  MPN_COPY (gp, bp, bn);
	  MPN_ZERO (gp + bn, mn - bn);
	}
    }
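
  /* Either way gp[0..mn-1] now holds |b| reduced mod m and padded to mn
     limbs: plain |b| mod m when !use_redc, or |b|*R^mn mod m (its REDC,
     i.e. Montgomery, representation) when use_redc, since |b| mod m was
     placed in the high mn limbs of tp before the final division by m.  */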

  /* Precompute the odd powers of the base: g[i] = x^(2*i+1) for
     i = 0 .. K/2-1 (each carrying the extra R^n factor when use_redc),
     stepping by xx = x^2.  */

  xp = TMP_ALLOC_LIMBS (mn);
  mpn_sqr_n (tp, gp, mn);
  if (use_redc)
    mpn_redc_1 (xp, tp, mp, mn, invm);		/* xx = x^2*R^n */
  else
    mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
  this_gp = gp;
  for (i = 1; i < K / 2; i++)
    {
      mpn_mul_n (tp, this_gp, xp, mn);
      this_gp += mn;
      if (use_redc)
	mpn_redc_1 (this_gp, tp, mp, mn, invm);	/* g[i] = x^(2i+1)*R^n */
      else
	mpn_tdiv_qr (qp, this_gp, 0L, tp, 2 * mn, mp, mn);
    }
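
  /* An odd window value c later selects its precomputed power as
     gp + mn * (c >> 1): c = 1, 3, 5, ... maps to g[0], g[1], g[2], ...  */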

  /* Start the real stuff.  */
  ep = PTR (e);
  i = en - 1;				/* current index */
  c = ep[i];				/* current limb */
  sh = GMP_NUMB_BITS - e_zero_cnt;	/* significant bits in ep[i] */
  sh -= k;				/* index of lower bit of ep[i] to take into account */
  if (sh < 0)
    {					/* k-sh extra bits are needed */
      if (i > 0)
	{
	  i--;
	  c <<= (-sh);
	  sh += GMP_NUMB_BITS;
	  c |= ep[i] >> sh;
	}
    }
  else
    c >>= sh;

  for (j = 0; c % 2 == 0; j++)
    c >>= 1;

  MPN_COPY (xp, gp + mn * (c >> 1), mn);
  while (--j >= 0)
    {
      mpn_sqr_n (tp, xp, mn);
      if (use_redc)
	mpn_redc_1 (xp, tp, mp, mn, invm);
      else
	mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
    }

  while (i > 0 || sh > 0)
    {
      c = ep[i];
      l = k;				/* number of bits treated */
      sh -= l;
      if (sh < 0)
	{
	  if (i > 0)
	    {
	      i--;
	      c <<= (-sh);
	      sh += GMP_NUMB_BITS;
	      c |= ep[i] >> sh;
	    }
	  else
	    {
	      l += sh;			/* last chunk of bits from e; l < k */
	    }
	}
      else
	c >>= sh;
      c &= ((mp_limb_t) 1 << l) - 1;

      /* This while loop implements the sliding window improvement--loop while
	 the most significant bit of c is zero, squaring xx as we go. */
      while ((c >> (l - 1)) == 0 && (i > 0 || sh > 0))
	{
	  mpn_sqr_n (tp, xp, mn);
	  if (use_redc)
	    mpn_redc_1 (xp, tp, mp, mn, invm);
	  else
	    mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
	  if (sh != 0)
	    {
	      sh--;
	      c = (c << 1) + ((ep[i] >> sh) & 1);
	    }
	  else
	    {
	      i--;
	      sh = GMP_NUMB_BITS - 1;
	      c = (c << 1) + (ep[i] >> sh);
	    }
	}

      /* Replace xx by xx^(2^l)*x^c.  */
      if (c != 0)
	{
	  for (j = 0; c % 2 == 0; j++)
	    c >>= 1;

	  /* The old c was c0 = c * 2^j, i.e. xx^(2^l)*x^c0 =
	     (xx^(2^(l-j))*x^c)^(2^j).  */
	  l -= j;
	  while (--l >= 0)
	    {
	      mpn_sqr_n (tp, xp, mn);
	      if (use_redc)
		mpn_redc_1 (xp, tp, mp, mn, invm);
	      else
		mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
	    }
	  mpn_mul_n (tp, xp, gp + mn * (c >> 1), mn);
	  if (use_redc)
	    mpn_redc_1 (xp, tp, mp, mn, invm);
	  else
	    mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
	}
      else
	j = l;				/* case c=0 */
      while (--j >= 0)
	{
	  mpn_sqr_n (tp, xp, mn);
	  if (use_redc)
	    mpn_redc_1 (xp, tp, mp, mn, invm);
	  else
	    mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
	}
    }

  if (use_redc)
    {
      /* Convert back xx to xx/R^n.  */
      MPN_COPY (tp, xp, mn);
      MPN_ZERO (tp + mn, mn);
      mpn_redc_1 (xp, tp, mp, mn, invm);
      if (mpn_cmp (xp, mp, mn) >= 0)
	mpn_sub_n (xp, xp, mp, mn);
    }
  else
    {
      if (m_zero_cnt != 0)
	{
	  mp_limb_t cy;
	  cy = mpn_lshift (tp, xp, mn, m_zero_cnt);
	  tp[mn] = cy;
	  mpn_tdiv_qr (qp, xp, 0L, tp, mn + (cy != 0), mp, mn);
	  mpn_rshift (xp, xp, mn, m_zero_cnt);
	}
    }
  xn = mn;
  MPN_NORMALIZE (xp, xn);

  if ((ep[0] & 1) && SIZ(b) < 0 && xn != 0)
    {
      mp = PTR(m);			/* want original, unnormalized m */
      mpn_sub (xp, mp, mn, xp, xn);
      xn = mn;
      MPN_NORMALIZE (xp, xn);
    }
  MPZ_REALLOC (r, xn);
  SIZ (r) = xn;
  MPN_COPY (PTR(r), xp, xn);

  __GMP_FREE_FUNC_LIMBS (gp, K / 2 * mn);
  TMP_FREE;
}
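
/* A minimal usage sketch of mpz_powm through the public mpz interface,
   kept out of the build with #if 0.  */
#if 0
#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t r, b, e, m;

  mpz_init (r);
  mpz_init_set_ui (b, 4);		/* base */
  mpz_init_set_ui (e, 13);		/* exponent */
  mpz_init_set_ui (m, 497);		/* modulus */

  mpz_powm (r, b, e, m);		/* r = 4^13 mod 497 = 445 */
  gmp_printf ("%Zd\n", r);

  mpz_clear (r);
  mpz_clear (b);
  mpz_clear (e);
  mpz_clear (m);
  return 0;
}
#endif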