xref: /netbsd/external/lgpl3/gmp/dist/mpn/generic/powm.c (revision 671ea119)
14a1767b4Smrg /* mpn_powm -- Compute R = U^E mod M.
24a1767b4Smrg 
34a1767b4Smrg    Contributed to the GNU project by Torbjorn Granlund.
44a1767b4Smrg 
54a1767b4Smrg    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
64a1767b4Smrg    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
74a1767b4Smrg    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
84a1767b4Smrg 
9*671ea119Smrg Copyright 2007-2012, 2019 Free Software Foundation, Inc.
104a1767b4Smrg 
114a1767b4Smrg This file is part of the GNU MP Library.
124a1767b4Smrg 
134a1767b4Smrg The GNU MP Library is free software; you can redistribute it and/or modify
14f81b1c5bSmrg it under the terms of either:
15f81b1c5bSmrg 
16f81b1c5bSmrg   * the GNU Lesser General Public License as published by the Free
17f81b1c5bSmrg     Software Foundation; either version 3 of the License, or (at your
184a1767b4Smrg     option) any later version.
194a1767b4Smrg 
20f81b1c5bSmrg or
21f81b1c5bSmrg 
22f81b1c5bSmrg   * the GNU General Public License as published by the Free Software
23f81b1c5bSmrg     Foundation; either version 2 of the License, or (at your option) any
24f81b1c5bSmrg     later version.
25f81b1c5bSmrg 
26f81b1c5bSmrg or both in parallel, as here.
27f81b1c5bSmrg 
284a1767b4Smrg The GNU MP Library is distributed in the hope that it will be useful, but
294a1767b4Smrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30f81b1c5bSmrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31f81b1c5bSmrg for more details.
324a1767b4Smrg 
33f81b1c5bSmrg You should have received copies of the GNU General Public License and the
34f81b1c5bSmrg GNU Lesser General Public License along with the GNU MP Library.  If not,
35f81b1c5bSmrg see https://www.gnu.org/licenses/.  */
364a1767b4Smrg 
374a1767b4Smrg 
384a1767b4Smrg /*
394a1767b4Smrg   BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.
404a1767b4Smrg 
414a1767b4Smrg   1. W <- U
424a1767b4Smrg 
434a1767b4Smrg   2. T <- (B^n * U) mod M                Convert to REDC form
444a1767b4Smrg 
454a1767b4Smrg   3. Compute table U^1, U^3, U^5... of E-dependent size
464a1767b4Smrg 
474a1767b4Smrg   4. While there are more bits in E
484a1767b4Smrg        W <- power left-to-right base-k
494a1767b4Smrg 
504a1767b4Smrg 
514a1767b4Smrg   TODO:
524a1767b4Smrg 
534a1767b4Smrg    * Make getbits a macro, thereby allowing it to update the index operand.
544a1767b4Smrg      That will simplify the code using getbits.  (Perhaps make getbits' sibling
554a1767b4Smrg      getbit then have similar form, for symmetry.)
564a1767b4Smrg 
574a1767b4Smrg    * Write an itch function.  Or perhaps get rid of tp parameter since the huge
584a1767b4Smrg      pp area is allocated locally anyway?
594a1767b4Smrg 
604a1767b4Smrg    * Choose window size without looping.  (Superoptimize or think(tm).)
614a1767b4Smrg 
624a1767b4Smrg    * Handle small bases with initial, reduction-free exponentiation.
634a1767b4Smrg 
644a1767b4Smrg    * Call new division functions, not mpn_tdiv_qr.
654a1767b4Smrg 
664a1767b4Smrg    * Consider special code for one-limb M.
674a1767b4Smrg 
684a1767b4Smrg    * How should we handle the redc1/redc2/redc_n choice?
694a1767b4Smrg      - redc1:  T(binvert_1limb)  + e * (n)   * (T(mullo-1x1) + n*T(addmul_1))
704a1767b4Smrg      - redc2:  T(binvert_2limbs) + e * (n/2) * (T(mullo-2x2) + n*T(addmul_2))
714a1767b4Smrg      - redc_n: T(binvert_nlimbs) + e * (T(mullo-nxn) + T(M(n)))
724a1767b4Smrg      This disregards the addmul_N constant term, but we could think of
734a1767b4Smrg      that as part of the respective mullo.
744a1767b4Smrg 
754a1767b4Smrg    * When U (the base) is small, we should start the exponentiation with plain
764a1767b4Smrg      operations, then convert that partial result to REDC form.
774a1767b4Smrg 
784a1767b4Smrg    * When U is just one limb, should it be handled without the k-ary tricks?
794a1767b4Smrg      We could keep a factor of B^n in W, but use U' = BU as base.  After
804a1767b4Smrg      multiplying by this (pseudo two-limb) number, we need to multiply by 1/B
814a1767b4Smrg      mod M.
824a1767b4Smrg */
834a1767b4Smrg 
844a1767b4Smrg #include "gmp-impl.h"
854a1767b4Smrg #include "longlong.h"
864a1767b4Smrg 
87*671ea119Smrg #undef MPN_REDC_0
88*671ea119Smrg #define MPN_REDC_0(rp, up, mp, invm)					\
89*671ea119Smrg   do {									\
90*671ea119Smrg     mp_limb_t p1, r0, u0, _dummy;					\
91*671ea119Smrg     u0 = *(up);								\
92*671ea119Smrg     umul_ppmm (p1, _dummy, *(mp), (u0 * (invm)) & GMP_NUMB_MASK);	\
93*671ea119Smrg     ASSERT (((u0 + _dummy) & GMP_NUMB_MASK) == 0);			\
94*671ea119Smrg     p1 += (u0 != 0);							\
95*671ea119Smrg     r0 = (up)[1] + p1;							\
96*671ea119Smrg     if (p1 > r0)							\
97*671ea119Smrg       r0 -= *(mp);							\
98*671ea119Smrg     *(rp) = r0;								\
99*671ea119Smrg   } while (0)
100*671ea119Smrg 
101d25e02daSmrg #undef MPN_REDC_1
102*671ea119Smrg #if HAVE_NATIVE_mpn_sbpi1_bdiv_r
103*671ea119Smrg #define MPN_REDC_1(rp, up, mp, n, invm)					\
104*671ea119Smrg   do {									\
105*671ea119Smrg     mp_limb_t cy;							\
106*671ea119Smrg     cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm);			\
107*671ea119Smrg     if (cy != 0)							\
108*671ea119Smrg       mpn_sub_n (rp, up + n, mp, n);					\
109*671ea119Smrg     else								\
110*671ea119Smrg       MPN_COPY (rp, up + n, n);						\
111*671ea119Smrg   } while (0)
112*671ea119Smrg #else
113d25e02daSmrg #define MPN_REDC_1(rp, up, mp, n, invm)					\
114d25e02daSmrg   do {									\
115d25e02daSmrg     mp_limb_t cy;							\
116d25e02daSmrg     cy = mpn_redc_1 (rp, up, mp, n, invm);				\
117d25e02daSmrg     if (cy != 0)							\
118d25e02daSmrg       mpn_sub_n (rp, rp, mp, n);					\
119d25e02daSmrg   } while (0)
120*671ea119Smrg #endif
121d25e02daSmrg 
122d25e02daSmrg #undef MPN_REDC_2
123d25e02daSmrg #define MPN_REDC_2(rp, up, mp, n, mip)					\
124d25e02daSmrg   do {									\
125d25e02daSmrg     mp_limb_t cy;							\
126d25e02daSmrg     cy = mpn_redc_2 (rp, up, mp, n, mip);				\
127d25e02daSmrg     if (cy != 0)							\
128d25e02daSmrg       mpn_sub_n (rp, rp, mp, n);					\
129d25e02daSmrg   } while (0)
130d25e02daSmrg 
1314a1767b4Smrg #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1324a1767b4Smrg #define WANT_REDC_2 1
1334a1767b4Smrg #endif
1344a1767b4Smrg 
1354a1767b4Smrg #define getbit(p,bi) \
1364a1767b4Smrg   ((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1)
1374a1767b4Smrg 
1384a1767b4Smrg static inline mp_limb_t
getbits(const mp_limb_t * p,mp_bitcnt_t bi,int nbits)1394a1767b4Smrg getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
1404a1767b4Smrg {
1414a1767b4Smrg   int nbits_in_r;
1424a1767b4Smrg   mp_limb_t r;
1434a1767b4Smrg   mp_size_t i;
1444a1767b4Smrg 
1454a1767b4Smrg   if (bi < nbits)
1464a1767b4Smrg     {
1474a1767b4Smrg       return p[0] & (((mp_limb_t) 1 << bi) - 1);
1484a1767b4Smrg     }
1494a1767b4Smrg   else
1504a1767b4Smrg     {
1514a1767b4Smrg       bi -= nbits;			/* bit index of low bit to extract */
1524a1767b4Smrg       i = bi / GMP_NUMB_BITS;		/* word index of low bit to extract */
1534a1767b4Smrg       bi %= GMP_NUMB_BITS;		/* bit index in low word */
1544a1767b4Smrg       r = p[i] >> bi;			/* extract (low) bits */
1554a1767b4Smrg       nbits_in_r = GMP_NUMB_BITS - bi;	/* number of bits now in r */
1564a1767b4Smrg       if (nbits_in_r < nbits)		/* did we get enough bits? */
1574a1767b4Smrg 	r += p[i + 1] << nbits_in_r;	/* prepend bits from higher word */
1584a1767b4Smrg       return r & (((mp_limb_t) 1 << nbits) - 1);
1594a1767b4Smrg     }
1604a1767b4Smrg }
1614a1767b4Smrg 
1624a1767b4Smrg static inline int
win_size(mp_bitcnt_t eb)1634a1767b4Smrg win_size (mp_bitcnt_t eb)
1644a1767b4Smrg {
1654a1767b4Smrg   int k;
1664a1767b4Smrg   static mp_bitcnt_t x[] = {0,7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0};
1674a1767b4Smrg   for (k = 1; eb > x[k]; k++)
1684a1767b4Smrg     ;
1694a1767b4Smrg   return k;
1704a1767b4Smrg }
1714a1767b4Smrg 
1724a1767b4Smrg /* Convert U to REDC form, U_r = B^n * U mod M */
1734a1767b4Smrg static void
redcify(mp_ptr rp,mp_srcptr up,mp_size_t un,mp_srcptr mp,mp_size_t n)1744a1767b4Smrg redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n)
1754a1767b4Smrg {
1764a1767b4Smrg   mp_ptr tp, qp;
1774a1767b4Smrg   TMP_DECL;
1784a1767b4Smrg   TMP_MARK;
1794a1767b4Smrg 
180f81b1c5bSmrg   TMP_ALLOC_LIMBS_2 (tp, un + n, qp, un + 1);
1814a1767b4Smrg 
1824a1767b4Smrg   MPN_ZERO (tp, n);
1834a1767b4Smrg   MPN_COPY (tp + n, up, un);
1844a1767b4Smrg   mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n);
1854a1767b4Smrg   TMP_FREE;
1864a1767b4Smrg }
1874a1767b4Smrg 
1884a1767b4Smrg /* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
1894a1767b4Smrg    Requires that mp[n-1..0] is odd.
1904a1767b4Smrg    Requires that ep[en-1..0] is > 1.
1914a1767b4Smrg    Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs.  */
1924a1767b4Smrg void
mpn_powm(mp_ptr rp,mp_srcptr bp,mp_size_t bn,mp_srcptr ep,mp_size_t en,mp_srcptr mp,mp_size_t n,mp_ptr tp)1934a1767b4Smrg mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
1944a1767b4Smrg 	  mp_srcptr ep, mp_size_t en,
1954a1767b4Smrg 	  mp_srcptr mp, mp_size_t n, mp_ptr tp)
1964a1767b4Smrg {
1974a1767b4Smrg   mp_limb_t ip[2], *mip;
1984a1767b4Smrg   int cnt;
1994a1767b4Smrg   mp_bitcnt_t ebi;
2004a1767b4Smrg   int windowsize, this_windowsize;
2014a1767b4Smrg   mp_limb_t expbits;
2024a1767b4Smrg   mp_ptr pp, this_pp;
2034a1767b4Smrg   long i;
2044a1767b4Smrg   TMP_DECL;
2054a1767b4Smrg 
2064a1767b4Smrg   ASSERT (en > 1 || (en == 1 && ep[0] > 1));
2074a1767b4Smrg   ASSERT (n >= 1 && ((mp[0] & 1) != 0));
2084a1767b4Smrg 
2094a1767b4Smrg   TMP_MARK;
2104a1767b4Smrg 
211d25e02daSmrg   MPN_SIZEINBASE_2EXP(ebi, ep, en, 1);
2124a1767b4Smrg 
2134a1767b4Smrg #if 0
2144a1767b4Smrg   if (bn < n)
2154a1767b4Smrg     {
2164a1767b4Smrg       /* Do the first few exponent bits without mod reductions,
2174a1767b4Smrg 	 until the result is greater than the mod argument.  */
2184a1767b4Smrg       for (;;)
2194a1767b4Smrg 	{
2204a1767b4Smrg 	  mpn_sqr (tp, this_pp, tn);
2214a1767b4Smrg 	  tn = tn * 2 - 1,  tn += tp[tn] != 0;
2224a1767b4Smrg 	  if (getbit (ep, ebi) != 0)
2234a1767b4Smrg 	    mpn_mul (..., tp, tn, bp, bn);
2244a1767b4Smrg 	  ebi--;
2254a1767b4Smrg 	}
2264a1767b4Smrg     }
2274a1767b4Smrg #endif
2284a1767b4Smrg 
2294a1767b4Smrg   windowsize = win_size (ebi);
2304a1767b4Smrg 
2314a1767b4Smrg #if WANT_REDC_2
2324a1767b4Smrg   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
2334a1767b4Smrg     {
2344a1767b4Smrg       mip = ip;
2354a1767b4Smrg       binvert_limb (mip[0], mp[0]);
2364a1767b4Smrg       mip[0] = -mip[0];
2374a1767b4Smrg     }
2384a1767b4Smrg   else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
2394a1767b4Smrg     {
2404a1767b4Smrg       mip = ip;
2414a1767b4Smrg       mpn_binvert (mip, mp, 2, tp);
2424a1767b4Smrg       mip[0] = -mip[0]; mip[1] = ~mip[1];
2434a1767b4Smrg     }
2444a1767b4Smrg #else
2454a1767b4Smrg   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
2464a1767b4Smrg     {
2474a1767b4Smrg       mip = ip;
2484a1767b4Smrg       binvert_limb (mip[0], mp[0]);
2494a1767b4Smrg       mip[0] = -mip[0];
2504a1767b4Smrg     }
2514a1767b4Smrg #endif
2524a1767b4Smrg   else
2534a1767b4Smrg     {
2544a1767b4Smrg       mip = TMP_ALLOC_LIMBS (n);
2554a1767b4Smrg       mpn_binvert (mip, mp, n, tp);
2564a1767b4Smrg     }
2574a1767b4Smrg 
2584a1767b4Smrg   pp = TMP_ALLOC_LIMBS (n << (windowsize - 1));
2594a1767b4Smrg 
2604a1767b4Smrg   this_pp = pp;
2614a1767b4Smrg   redcify (this_pp, bp, bn, mp, n);
2624a1767b4Smrg 
2634a1767b4Smrg   /* Store b^2 at rp.  */
2644a1767b4Smrg   mpn_sqr (tp, this_pp, n);
265*671ea119Smrg #if 0
266*671ea119Smrg   if (n == 1) {
267*671ea119Smrg     MPN_REDC_0 (rp, tp, mp, mip[0]);
268*671ea119Smrg   } else
269*671ea119Smrg #endif
2704a1767b4Smrg #if WANT_REDC_2
2714a1767b4Smrg   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
272d25e02daSmrg     MPN_REDC_1 (rp, tp, mp, n, mip[0]);
2734a1767b4Smrg   else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
274d25e02daSmrg     MPN_REDC_2 (rp, tp, mp, n, mip);
2754a1767b4Smrg #else
2764a1767b4Smrg   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
277d25e02daSmrg     MPN_REDC_1 (rp, tp, mp, n, mip[0]);
2784a1767b4Smrg #endif
2794a1767b4Smrg   else
2804a1767b4Smrg     mpn_redc_n (rp, tp, mp, n, mip);
2814a1767b4Smrg 
2824a1767b4Smrg   /* Precompute odd powers of b and put them in the temporary area at pp.  */
2834a1767b4Smrg   for (i = (1 << (windowsize - 1)) - 1; i > 0; i--)
284*671ea119Smrg #if 1
285*671ea119Smrg     if (n == 1) {
286*671ea119Smrg       umul_ppmm((tp)[1], *(tp), *(this_pp), *(rp));
287*671ea119Smrg       ++this_pp ;
288*671ea119Smrg       MPN_REDC_0 (this_pp, tp, mp, mip[0]);
289*671ea119Smrg     } else
290*671ea119Smrg #endif
2914a1767b4Smrg     {
2924a1767b4Smrg       mpn_mul_n (tp, this_pp, rp, n);
2934a1767b4Smrg       this_pp += n;
2944a1767b4Smrg #if WANT_REDC_2
2954a1767b4Smrg       if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
296d25e02daSmrg 	MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
2974a1767b4Smrg       else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
298d25e02daSmrg 	MPN_REDC_2 (this_pp, tp, mp, n, mip);
2994a1767b4Smrg #else
3004a1767b4Smrg       if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
301d25e02daSmrg 	MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
3024a1767b4Smrg #endif
3034a1767b4Smrg       else
3044a1767b4Smrg 	mpn_redc_n (this_pp, tp, mp, n, mip);
3054a1767b4Smrg     }
3064a1767b4Smrg 
3074a1767b4Smrg   expbits = getbits (ep, ebi, windowsize);
3084a1767b4Smrg   if (ebi < windowsize)
3094a1767b4Smrg     ebi = 0;
3104a1767b4Smrg   else
3114a1767b4Smrg     ebi -= windowsize;
3124a1767b4Smrg 
3134a1767b4Smrg   count_trailing_zeros (cnt, expbits);
3144a1767b4Smrg   ebi += cnt;
3154a1767b4Smrg   expbits >>= cnt;
3164a1767b4Smrg 
3174a1767b4Smrg   MPN_COPY (rp, pp + n * (expbits >> 1), n);
3184a1767b4Smrg 
3194a1767b4Smrg #define INNERLOOP							\
3204a1767b4Smrg   while (ebi != 0)							\
3214a1767b4Smrg     {									\
3224a1767b4Smrg       while (getbit (ep, ebi) == 0)					\
3234a1767b4Smrg 	{								\
3244a1767b4Smrg 	  MPN_SQR (tp, rp, n);						\
3254a1767b4Smrg 	  MPN_REDUCE (rp, tp, mp, n, mip);				\
326*671ea119Smrg 	  if (--ebi == 0)						\
3274a1767b4Smrg 	    goto done;							\
3284a1767b4Smrg 	}								\
3294a1767b4Smrg 									\
3304a1767b4Smrg       /* The next bit of the exponent is 1.  Now extract the largest	\
3314a1767b4Smrg 	 block of bits <= windowsize, and such that the least		\
3324a1767b4Smrg 	 significant bit is 1.  */					\
3334a1767b4Smrg 									\
3344a1767b4Smrg       expbits = getbits (ep, ebi, windowsize);				\
3354a1767b4Smrg       this_windowsize = windowsize;					\
3364a1767b4Smrg       if (ebi < windowsize)						\
3374a1767b4Smrg 	{								\
3384a1767b4Smrg 	  this_windowsize -= windowsize - ebi;				\
3394a1767b4Smrg 	  ebi = 0;							\
3404a1767b4Smrg 	}								\
3414a1767b4Smrg       else								\
3424a1767b4Smrg         ebi -= windowsize;						\
3434a1767b4Smrg 									\
3444a1767b4Smrg       count_trailing_zeros (cnt, expbits);				\
3454a1767b4Smrg       this_windowsize -= cnt;						\
3464a1767b4Smrg       ebi += cnt;							\
3474a1767b4Smrg       expbits >>= cnt;							\
3484a1767b4Smrg 									\
3494a1767b4Smrg       do								\
3504a1767b4Smrg 	{								\
3514a1767b4Smrg 	  MPN_SQR (tp, rp, n);						\
3524a1767b4Smrg 	  MPN_REDUCE (rp, tp, mp, n, mip);				\
3534a1767b4Smrg 	}								\
354*671ea119Smrg       while (--this_windowsize != 0);					\
3554a1767b4Smrg 									\
3564a1767b4Smrg       MPN_MUL_N (tp, rp, pp + n * (expbits >> 1), n);			\
3574a1767b4Smrg       MPN_REDUCE (rp, tp, mp, n, mip);					\
3584a1767b4Smrg     }
3594a1767b4Smrg 
3604a1767b4Smrg 
361*671ea119Smrg   if (n == 1)
362*671ea119Smrg     {
363*671ea119Smrg #undef MPN_MUL_N
364*671ea119Smrg #undef MPN_SQR
365*671ea119Smrg #undef MPN_REDUCE
366*671ea119Smrg #define MPN_MUL_N(r,a,b,n)		umul_ppmm((r)[1], *(r), *(a), *(b))
367*671ea119Smrg #define MPN_SQR(r,a,n)			umul_ppmm((r)[1], *(r), *(a), *(a))
368*671ea119Smrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_0(rp, tp, mp, mip[0])
369*671ea119Smrg       INNERLOOP;
370*671ea119Smrg     }
371*671ea119Smrg   else
3724a1767b4Smrg #if WANT_REDC_2
3734a1767b4Smrg   if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD)
3744a1767b4Smrg     {
3754a1767b4Smrg       if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
3764a1767b4Smrg 	{
377d25e02daSmrg 	  if (REDC_1_TO_REDC_2_THRESHOLD < SQR_BASECASE_THRESHOLD
378d25e02daSmrg 	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
379d25e02daSmrg 	    {
3804a1767b4Smrg #undef MPN_MUL_N
3814a1767b4Smrg #undef MPN_SQR
3824a1767b4Smrg #undef MPN_REDUCE
3834a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
384d25e02daSmrg #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
385d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
3864a1767b4Smrg 	      INNERLOOP;
3874a1767b4Smrg 	    }
388d25e02daSmrg 	  else
3894a1767b4Smrg 	    {
3904a1767b4Smrg #undef MPN_MUL_N
3914a1767b4Smrg #undef MPN_SQR
3924a1767b4Smrg #undef MPN_REDUCE
3934a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
3944a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
395d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
3964a1767b4Smrg 	      INNERLOOP;
3974a1767b4Smrg 	    }
398d25e02daSmrg 	}
399d25e02daSmrg       else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
400d25e02daSmrg 	{
401d25e02daSmrg 	  if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
402d25e02daSmrg 	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
403d25e02daSmrg 	    {
404d25e02daSmrg #undef MPN_MUL_N
405d25e02daSmrg #undef MPN_SQR
406d25e02daSmrg #undef MPN_REDUCE
407d25e02daSmrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
408d25e02daSmrg #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
409d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2 (rp, tp, mp, n, mip)
410d25e02daSmrg 	      INNERLOOP;
411d25e02daSmrg 	    }
412d25e02daSmrg 	  else
413d25e02daSmrg 	    {
414d25e02daSmrg #undef MPN_MUL_N
415d25e02daSmrg #undef MPN_SQR
416d25e02daSmrg #undef MPN_REDUCE
417d25e02daSmrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
418d25e02daSmrg #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
419d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2 (rp, tp, mp, n, mip)
420d25e02daSmrg 	      INNERLOOP;
421d25e02daSmrg 	    }
422d25e02daSmrg 	}
4234a1767b4Smrg       else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
4244a1767b4Smrg 	{
4254a1767b4Smrg #undef MPN_MUL_N
4264a1767b4Smrg #undef MPN_SQR
4274a1767b4Smrg #undef MPN_REDUCE
4284a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
4294a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
430d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2 (rp, tp, mp, n, mip)
4314a1767b4Smrg 	  INNERLOOP;
4324a1767b4Smrg 	}
4334a1767b4Smrg       else
4344a1767b4Smrg 	{
4354a1767b4Smrg #undef MPN_MUL_N
4364a1767b4Smrg #undef MPN_SQR
4374a1767b4Smrg #undef MPN_REDUCE
4384a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
4394a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
4404a1767b4Smrg #define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
4414a1767b4Smrg 	  INNERLOOP;
4424a1767b4Smrg 	}
4434a1767b4Smrg     }
4444a1767b4Smrg   else
4454a1767b4Smrg     {
4464a1767b4Smrg       if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
4474a1767b4Smrg 	{
448d25e02daSmrg 	  if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
449d25e02daSmrg 	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
450d25e02daSmrg 	    {
451d25e02daSmrg #undef MPN_MUL_N
452d25e02daSmrg #undef MPN_SQR
453d25e02daSmrg #undef MPN_REDUCE
454d25e02daSmrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
455d25e02daSmrg #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
456d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
457d25e02daSmrg 	      INNERLOOP;
458d25e02daSmrg 	    }
459d25e02daSmrg 	  else
460d25e02daSmrg 	    {
4614a1767b4Smrg #undef MPN_MUL_N
4624a1767b4Smrg #undef MPN_SQR
4634a1767b4Smrg #undef MPN_REDUCE
4644a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
4654a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
466d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
4674a1767b4Smrg 	      INNERLOOP;
4684a1767b4Smrg 	    }
469d25e02daSmrg 	}
4704a1767b4Smrg       else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
4714a1767b4Smrg 	{
4724a1767b4Smrg #undef MPN_MUL_N
4734a1767b4Smrg #undef MPN_SQR
4744a1767b4Smrg #undef MPN_REDUCE
4754a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
4764a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
477d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
4784a1767b4Smrg 	  INNERLOOP;
4794a1767b4Smrg 	}
4804a1767b4Smrg       else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
4814a1767b4Smrg 	{
4824a1767b4Smrg #undef MPN_MUL_N
4834a1767b4Smrg #undef MPN_SQR
4844a1767b4Smrg #undef MPN_REDUCE
4854a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
4864a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
487d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2 (rp, tp, mp, n, mip)
4884a1767b4Smrg 	  INNERLOOP;
4894a1767b4Smrg 	}
4904a1767b4Smrg       else
4914a1767b4Smrg 	{
4924a1767b4Smrg #undef MPN_MUL_N
4934a1767b4Smrg #undef MPN_SQR
4944a1767b4Smrg #undef MPN_REDUCE
4954a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
4964a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
4974a1767b4Smrg #define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
4984a1767b4Smrg 	  INNERLOOP;
4994a1767b4Smrg 	}
5004a1767b4Smrg     }
5014a1767b4Smrg 
5024a1767b4Smrg #else  /* WANT_REDC_2 */
5034a1767b4Smrg 
5044a1767b4Smrg   if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD)
5054a1767b4Smrg     {
5064a1767b4Smrg       if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
5074a1767b4Smrg 	{
508d25e02daSmrg 	  if (REDC_1_TO_REDC_N_THRESHOLD < SQR_BASECASE_THRESHOLD
509d25e02daSmrg 	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
510d25e02daSmrg 	    {
511d25e02daSmrg #undef MPN_MUL_N
512d25e02daSmrg #undef MPN_SQR
513d25e02daSmrg #undef MPN_REDUCE
514d25e02daSmrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
515d25e02daSmrg #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
516d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
517d25e02daSmrg 	      INNERLOOP;
518d25e02daSmrg 	    }
519d25e02daSmrg 	  else
520d25e02daSmrg 	    {
5214a1767b4Smrg #undef MPN_MUL_N
5224a1767b4Smrg #undef MPN_SQR
5234a1767b4Smrg #undef MPN_REDUCE
5244a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
5254a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
526d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
5274a1767b4Smrg 	      INNERLOOP;
5284a1767b4Smrg 	    }
529d25e02daSmrg 	}
5304a1767b4Smrg       else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
5314a1767b4Smrg 	{
532d25e02daSmrg 	  if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
533d25e02daSmrg 	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
534d25e02daSmrg 	    {
535d25e02daSmrg #undef MPN_MUL_N
536d25e02daSmrg #undef MPN_SQR
537d25e02daSmrg #undef MPN_REDUCE
538d25e02daSmrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
539d25e02daSmrg #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
540d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
541d25e02daSmrg 	      INNERLOOP;
542d25e02daSmrg 	    }
543d25e02daSmrg 	  else
544d25e02daSmrg 	    {
5454a1767b4Smrg #undef MPN_MUL_N
5464a1767b4Smrg #undef MPN_SQR
5474a1767b4Smrg #undef MPN_REDUCE
5484a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
5494a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
5504a1767b4Smrg #define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
5514a1767b4Smrg 	      INNERLOOP;
5524a1767b4Smrg 	    }
553d25e02daSmrg 	}
5544a1767b4Smrg       else
5554a1767b4Smrg 	{
5564a1767b4Smrg #undef MPN_MUL_N
5574a1767b4Smrg #undef MPN_SQR
5584a1767b4Smrg #undef MPN_REDUCE
5594a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
5604a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
5614a1767b4Smrg #define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
5624a1767b4Smrg 	  INNERLOOP;
5634a1767b4Smrg 	}
5644a1767b4Smrg     }
5654a1767b4Smrg   else
5664a1767b4Smrg     {
5674a1767b4Smrg       if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
5684a1767b4Smrg 	{
569d25e02daSmrg 	  if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
570d25e02daSmrg 	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
571d25e02daSmrg 	    {
572d25e02daSmrg #undef MPN_MUL_N
573d25e02daSmrg #undef MPN_SQR
574d25e02daSmrg #undef MPN_REDUCE
575d25e02daSmrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
576d25e02daSmrg #define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
577d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
578d25e02daSmrg 	      INNERLOOP;
579d25e02daSmrg 	    }
580d25e02daSmrg 	  else
581d25e02daSmrg 	    {
5824a1767b4Smrg #undef MPN_MUL_N
5834a1767b4Smrg #undef MPN_SQR
5844a1767b4Smrg #undef MPN_REDUCE
5854a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
5864a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
587d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
5884a1767b4Smrg 	      INNERLOOP;
5894a1767b4Smrg 	    }
590d25e02daSmrg 	}
5914a1767b4Smrg       else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
5924a1767b4Smrg 	{
5934a1767b4Smrg #undef MPN_MUL_N
5944a1767b4Smrg #undef MPN_SQR
5954a1767b4Smrg #undef MPN_REDUCE
5964a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
5974a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
598d25e02daSmrg #define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
5994a1767b4Smrg 	  INNERLOOP;
6004a1767b4Smrg 	}
6014a1767b4Smrg       else
6024a1767b4Smrg 	{
6034a1767b4Smrg #undef MPN_MUL_N
6044a1767b4Smrg #undef MPN_SQR
6054a1767b4Smrg #undef MPN_REDUCE
6064a1767b4Smrg #define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
6074a1767b4Smrg #define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
6084a1767b4Smrg #define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
6094a1767b4Smrg 	  INNERLOOP;
6104a1767b4Smrg 	}
6114a1767b4Smrg     }
6124a1767b4Smrg #endif  /* WANT_REDC_2 */
6134a1767b4Smrg 
6144a1767b4Smrg  done:
6154a1767b4Smrg 
6164a1767b4Smrg   MPN_COPY (tp, rp, n);
6174a1767b4Smrg   MPN_ZERO (tp + n, n);
6184a1767b4Smrg 
6194a1767b4Smrg #if WANT_REDC_2
6204a1767b4Smrg   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
621d25e02daSmrg     MPN_REDC_1 (rp, tp, mp, n, mip[0]);
6224a1767b4Smrg   else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
623d25e02daSmrg     MPN_REDC_2 (rp, tp, mp, n, mip);
6244a1767b4Smrg #else
6254a1767b4Smrg   if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
626d25e02daSmrg     MPN_REDC_1 (rp, tp, mp, n, mip[0]);
6274a1767b4Smrg #endif
6284a1767b4Smrg   else
6294a1767b4Smrg     mpn_redc_n (rp, tp, mp, n, mip);
6304a1767b4Smrg 
6314a1767b4Smrg   if (mpn_cmp (rp, mp, n) >= 0)
6324a1767b4Smrg     mpn_sub_n (rp, rp, mp, n);
6334a1767b4Smrg 
6344a1767b4Smrg   TMP_FREE;
6354a1767b4Smrg }
636