/* mpn_powm -- Compute R = U^E mod M.

Copyright 2007, 2008, 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
  BASIC ALGORITHM, Compute b^e mod n, where n is odd.

  1. w <- b

  2. While w^2 < n (and there are more bits in e)
       w <- power left-to-right base-2 without reduction

  3. t <- (B^k * b) mod n              Convert to REDC form (k = limbs in n)

  4. Compute power table of e-dependent size

  5. While there are more bits in e
       w <- power left-to-right base-k with reduction


  TODO:

   * Make getbits a macro, thereby allowing it to update the index operand.
     That will simplify the code using getbits.  (Perhaps make getbits' sibling
     getbit then have similar form, for symmetry.)

   * Write an itch function.

   * Choose window size without looping.  (Superoptimize or think(tm).)

   * How do we handle small bases?

   * This is slower than old mpz code, in particular if we base it on redc_1
     (use: #undef HAVE_NATIVE_mpn_addmul_2).  Why?

   * Make it sub-quadratic.

   * Call new division functions, not mpn_tdiv_qr.

   * Is redc obsolete with improved SB division?

   * Consider special code for one-limb M.

   * CRT for N = odd*2^t:
      Using Newton's method and 2-adic arithmetic:
        m1_inv_m2 = 1/odd mod 2^t
      Plain 2-adic (REDC) modexp:
        r1 = a ^ b mod odd
      Mullo+sqrlo-based modexp:
        r2 = a ^ b mod 2^t
      mullo, mul, add:
        r = ((r2 - r1) * m1_i_m2 mod 2^t) * odd + r1

   * How should we handle the redc1/redc2/redc3/redc4/redc_subquad choice?
     - redc1: T(binvert_1limb)  + e * (n)   * (T(mullo1x1) + n*T(addmul_1))
     - redc2: T(binvert_2limbs) + e * (n/2) * (T(mullo2x2) + n*T(addmul_2))
     - redc3: T(binvert_3limbs) + e * (n/3) * (T(mullo3x3) + n*T(addmul_3))
     This disregards the addmul_N constant term, but we could think of
     that as part of the respective mulloNxN.
*/

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Return bit number BI of the limb vector P as 0 or 1.  BI is 1-based: BI = 1
   names the least significant bit of P[0], so BI must be at least 1.  Both
   arguments are parenthesized so that arbitrary expressions may be passed,
   but note that BI is still evaluated twice.  */
#define getbit(p,bi) \
  (((p)[((bi) - 1) / GMP_LIMB_BITS] >> (((bi) - 1) % GMP_LIMB_BITS)) & 1)

/* Fetch a group of bits from the limb vector P.  Bit positions count from 0
   at the least significant bit of P[0].

   When BI >= NBITS the result is the NBITS-bit field occupying positions
   BI-NBITS .. BI-1.  When fewer than NBITS bits lie below position BI, the
   result is simply the low BI bits of P[0].  */
static inline mp_limb_t
getbits (const mp_limb_t *p, unsigned long bi, int nbits)
{
  unsigned long lo_bit;
  mp_size_t limb_idx;
  int shift, avail;
  mp_limb_t field;

  /* Short field at the very bottom of the vector.  */
  if (bi < nbits)
    return p[0] & (((mp_limb_t) 1 << bi) - 1);

  lo_bit = bi - nbits;			/* position of the field's low bit */
  limb_idx = lo_bit / GMP_LIMB_BITS;	/* limb holding that bit */
  shift = lo_bit % GMP_LIMB_BITS;	/* its offset inside the limb */

  field = p[limb_idx] >> shift;
  avail = GMP_LIMB_BITS - shift;	/* bits obtained from this limb */

  /* The field straddles a limb boundary: pull the rest from the next limb.  */
  if (avail < nbits)
    field += p[limb_idx + 1] << avail;

  return field & (((mp_limb_t) 1 << nbits) - 1);
}

/* Unconditionally discard any HAVE_NATIVE_mpn_addmul_2 setting, so the rest
   of this file is compiled as if no native mpn_addmul_2 existed (compare the
   "#undef HAVE_NATIVE_mpn_addmul_2" remark in the TODO list above).  */
#undef HAVE_NATIVE_mpn_addmul_2

/* Because of the #undef just above, this test always succeeds and the redc_2
   threshold is set to MP_SIZE_T_MAX.  */
#ifndef HAVE_NATIVE_mpn_addmul_2
#define REDC_2_THRESHOLD		MP_SIZE_T_MAX
#endif

/* Fallback threshold.  Not reached while the #undef above is in place, since
   REDC_2_THRESHOLD has then always been defined by the previous block.  */
#ifndef REDC_2_THRESHOLD
#define REDC_2_THRESHOLD		4
#endif

/* Stand-in for a general n-limb REDC routine, the final alternative in
   MPN_REDC_X.  It does no work: any call trips ASSERT_ALWAYS (0).  The empty
   (non-prototype) parameter list lets MPN_REDC_X call it with the same five
   arguments it passes to mpn_redc_2.  */
static void mpn_redc_n () {ASSERT_ALWAYS(0);}

/* Return the sliding-window width, in bits, to use for an exponent that is
   EB bits long.  The result lies in 1..10.

   The width is never 0, not even for EB <= 1: the caller sizes its power
   table as n << (width - 1), which would be a negative shift for width 0.
   For every EB >= 2 the result is the same as the table always gave.  */
static inline int
win_size (unsigned long eb)
{
  /* limit[k] is the largest bit count handled with a window of k + 1 bits.
     The all-ones last entry is a sentinel that stops the scan for any EB.  */
  static const unsigned long limit[] =
    {7, 25, 81, 241, 673, 1793, 4609, 11521, 28161, ~0ul};
  int k = 0;

  while (eb > limit[k])
    k++;
  return k + 1;
}

/* Call the REDC routine selected by the variable redc_x, which must be in
   scope wherever this macro is used:
     redc_x == 1   mpn_redc_1, given only the low limb of MIP
     redc_x == 2   mpn_redc_2, given the MIP pointer
     otherwise     mpn_redc_n, given the MIP pointer
   MIP is parenthesized before subscripting so that a pointer expression such
   as "inv + 1" selects the intended limb.  */
#define MPN_REDC_X(rp, tp, mp, n, mip)					\
  do {									\
    if (redc_x == 1)							\
      mpn_redc_1 (rp, tp, mp, n, (mip)[0]);				\
    else if (redc_x == 2)						\
      mpn_redc_2 (rp, tp, mp, n, mip);					\
    else								\
      mpn_redc_n (rp, tp, mp, n, mip);					\
  } while (0)

  /* Convert U to REDC form: store U_r = B^n * U mod M at rp[n-1..0], where
     U is up[un-1..0] and M is mp[n-1..0].  Temporary space comes from the
     TMP allocator and is released before returning.  */
static void
redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n)
{
  mp_size_t shifted_n = un + n;
  mp_ptr shifted, quot;
  TMP_DECL;

  TMP_MARK;
  shifted = TMP_ALLOC_LIMBS (shifted_n);
  quot = TMP_ALLOC_LIMBS (un + 1);	/* FIXME: Put inside shifted?  */

  /* Build U * B^n: the limbs of U placed above n zero limbs.  */
  MPN_COPY (shifted + n, up, un);
  MPN_ZERO (shifted, n);

  /* Divide by M.  The remainder is written straight to rp; the quotient in
     quot is not used afterwards.  */
  mpn_tdiv_qr (quot, rp, 0L, shifted, shifted_n, mp, n);

  TMP_FREE;
}

/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
   Requires that mp[n-1..0] is odd.
   Requires that ep[en-1..0] is > 1.
   Uses scratch space tp[3n..0], i.e., 3n+1 words.

   Outline: the base is converted to REDC form with redcify, a table of its
   odd powers is built, and the exponent is then consumed from its most
   significant bit downwards in windows of at most `windowsize' bits, with
   every product and square passed through MPN_REDC_X.  A last REDC of the
   result padded with n zero limbs converts it back out of REDC form.

   The table of odd powers is obtained with TMP_ALLOC_LIMBS and released by
   TMP_FREE on exit.  */
void
mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
	  mp_srcptr ep, mp_size_t en,
	  mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t mip[2];		/* inverse limb(s) handed to the REDC routine */
  int cnt;
  long ebi;			/* number of exponent bits not yet consumed */
  int windowsize, this_windowsize;
  mp_limb_t expbits;		/* exponent bits of the current window */
  mp_ptr pp, this_pp, last_pp;	/* power table and cursors into it */
  mp_ptr b2p;			/* b^2 in REDC form, kept at tp + 2n */
  long i;
  int redc_x;			/* REDC variant selector read by MPN_REDC_X */
  TMP_DECL;

  ASSERT (en > 1 || (en == 1 && ep[0] > 1));
  ASSERT (n >= 1 && ((mp[0] & 1) != 0));

  TMP_MARK;

  /* ebi = bit length of the exponent, derived from its top limb.
     NOTE(review): this presumably needs ep[en - 1] != 0 -- confirm against
     the callers.  */
  count_leading_zeros (cnt, ep[en - 1]);
  ebi = en * GMP_LIMB_BITS - cnt;

  /* Disabled sketch of step 2 of the algorithm description at the top of this
     file (initial squarings without reduction).  It refers to an undeclared
     `tn' and contains a literal "...", so it cannot be enabled as is.  */
#if 0
  if (bn < n)
    {
      /* Do the first few exponent bits without mod reductions,
	 until the result is greater than the mod argument.  */
      for (;;)
	{
	  mpn_sqr_n (tp, this_pp, tn);
	  tn = tn * 2 - 1,  tn += tp[tn] != 0;
	  if (getbit (ep, ebi) != 0)
	    mpn_mul (..., tp, tn, bp, bn);
	  ebi--;
	}
    }
#endif

  windowsize = win_size (ebi);

  /* Select the REDC variant and compute the inverse it needs.  For redc_1
     the single limb from binvert_limb is negated (presumably giving -1/mp[0]
     modulo the limb base; binvert_limb is defined outside this file).

     HAVE_NATIVE_mpn_addmul_2 is #undef'ed earlier in this file, so the `else'
     branch below is compiled out.  If the BELOW_THRESHOLD test were ever
     false in that configuration, mip and redc_x would be left unset;
     presumably REDC_2_THRESHOLD == MP_SIZE_T_MAX makes the test always true
     -- confirm against BELOW_THRESHOLD in gmp-impl.h.  */
  if (BELOW_THRESHOLD (n, REDC_2_THRESHOLD))
    {
      binvert_limb (mip[0], mp[0]);
      mip[0] = -mip[0];
      redc_x = 1;
    }
#if defined (HAVE_NATIVE_mpn_addmul_2)
  else
    {
      mpn_binvert (mip, mp, 2, tp);
      mip[0] = -mip[0]; mip[1] = ~mip[1];
      redc_x = 2;
    }
#endif
  /* Disabled: full n-limb inverse for a general REDC (redc_x == 0 would make
     MPN_REDC_X call mpn_redc_n).  */
#if 0
  mpn_binvert (mip, mp, n, tp);
  redc_x = 0;
#endif

  /* Table of 2^(windowsize-1) entries of n limbs each.  */
  pp = TMP_ALLOC_LIMBS (n << (windowsize - 1));

  /* Entry 0: the base itself, converted to REDC form.  */
  this_pp = pp;
  redcify (this_pp, bp, bn, mp, n);

  /* tp[2n-1..0] receives each double-length product; b^2 lives above it.  */
  b2p = tp + 2*n;

  /* Store b^2 in b2.  */
  mpn_sqr_n (tp, this_pp, n);
  MPN_REDC_X (b2p, tp, mp, n, mip);

  /* Precompute odd powers of b and put them in the temporary area at pp.
     Each entry is the previous one times b^2, so entry j holds b^(2j+1).  */
  for (i = (1 << (windowsize - 1)) - 1; i > 0; i--)
    {
      last_pp = this_pp;
      this_pp += n;
      mpn_mul_n (tp, last_pp, b2p, n);
      MPN_REDC_X (this_pp, tp, mp, n, mip);
    }

  /* First window: take the top `windowsize' bits of the exponent.  */
  expbits = getbits (ep, ebi, windowsize);
  ebi -= windowsize;
  if (ebi < 0)
    ebi = 0;

  /* Hand trailing zero bits back to ebi so that expbits is odd and indexes
     the table of odd powers; those bits are handled by the loop below.  */
  count_trailing_zeros (cnt, expbits);
  ebi += cnt;
  expbits >>= cnt;

  /* Start the accumulator at b^expbits, i.e. table entry expbits >> 1.  */
  MPN_COPY (rp, pp + n * (expbits >> 1), n);

  while (ebi != 0)
    {
      /* Each zero bit of the exponent costs one squaring.  */
      while (getbit (ep, ebi) == 0)
	{
	  mpn_sqr_n (tp, rp, n);
	  MPN_REDC_X (rp, tp, mp, n, mip);
	  ebi--;
	  if (ebi == 0)
	    goto done;
	}

      /* The next bit of the exponent is 1.  Now extract the largest block of
	 bits <= windowsize, and such that the least significant bit is 1.  */

      expbits = getbits (ep, ebi, windowsize);
      ebi -= windowsize;
      this_windowsize = windowsize;
      if (ebi < 0)
	{
	  /* Fewer than windowsize bits were left: getbits returned only the
	     remaining ones, so shrink the window by the shortfall.  */
	  this_windowsize += ebi;
	  ebi = 0;
	}

      /* Drop trailing zeros from the window and return them to ebi.  */
      count_trailing_zeros (cnt, expbits);
      this_windowsize -= cnt;
      ebi += cnt;
      expbits >>= cnt;

      /* One squaring per bit in the window ...  */
      do
	{
	  mpn_sqr_n (tp, rp, n);
	  MPN_REDC_X (rp, tp, mp, n, mip);
	  this_windowsize--;
	}
      while (this_windowsize != 0);

      /* ... followed by one multiplication by the odd power b^expbits.  */
      mpn_mul_n (tp, rp, pp + n * (expbits >> 1), n);
      MPN_REDC_X (rp, tp, mp, n, mip);
    }

 done:
  /* Leave REDC form: reduce the result extended with n zero high limbs, then
     subtract mp once if the outcome is not already below it.  */
  MPN_COPY (tp, rp, n);
  MPN_ZERO (tp + n, n);
  MPN_REDC_X (rp, tp, mp, n, mip);
  if (mpn_cmp (rp, mp, n) >= 0)
    mpn_sub_n (rp, rp, mp, n);
  TMP_FREE;
}