gmp-ecm/ecm-7.0.4/pm1fs2.c

/* Implementation of fast stage 2 for P-1 and P+1 as described in
   "Improved Stage 2 to $P\pm{}1$ Factoring Algorithms" by
   Peter L. Montgomery and Alexander Kruppa, ANTS 2008 (8th Algorithmic
   Number Theory Symposium).

Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2016
Alexander Kruppa, Paul Zimmermann, David Cleaver
NTT functions are based on code Copyright 2005 Dave Newman.

This file is part of the ECM Library.

The ECM Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The ECM Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the ECM Library; see the file COPYING.LIB.  If not, see
http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include "ecm-impl.h"
#include "sp.h"
#include <math.h>
#ifdef HAVE_ALLOCA_H
#include <alloca.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef _OPENMP
#include <omp.h>
#endif

/* TODO:
   - move functions into their proper files (i.e. NTT functions etc.)
   - later: allow storing NTT vectors on disk
*/

/* Define TEST_ZERO_RESULT to test if any result of the multipoint
   evaluation is equal to zero. If the modulus is composite, this
   happening might indicate a problem in the evalutaion code */
#define TEST_ZERO_RESULT

const int pari = 0;

const unsigned long Pvalues[] = {
    3UL, 5UL, 9UL, 15UL, 21UL, 17UL, 27UL, 33UL, 45UL, 51UL, 63UL, 75UL,
    105UL, 99UL, 135UL, 165UL, 195UL, 189UL, 231UL, 255UL, 315UL, 345UL,
    357UL, 375UL, 405UL, 435UL, 525UL, 585UL, 615UL, 735UL, 765UL, 825UL,
    945UL, 1155UL, 1065UL, 1365UL, 1305UL, 1335UL, 1575UL, 1785UL, 1995UL,
    2145UL, 2205UL, 2415UL, 2625UL, 2805UL, 3045UL, 3465UL, 3675UL, 4095UL,
    4305UL, 4515UL, 4725UL, 4785UL, 5355UL, 5775UL, 5985UL, 5865UL, 6825UL,
    7245UL, 8085UL, 8925UL, 9555UL, 10395UL, 10725UL, 11025UL, 12285UL,
    12705UL, 15015UL, 14175UL, 15225UL, 16065UL, 17325UL, 19635UL, 21945UL,
    23205UL, 24255UL, 25935UL, 26775UL, 28875UL, 31395UL, 33495UL, 35805UL,
    36465UL, 38115UL, 39585UL, 40425UL, 45045UL, 45885UL, 49665UL, 51765UL,
    58905UL, 65835UL, 69615UL, 75075UL, 77805UL, 82005UL, 84315UL, 86625UL,
    88935UL, 94185UL, 98175UL, 105105UL, 109725UL, 116025UL, 118755UL,
    121275UL, 135135UL, 137445UL, 137655UL, 144375UL, 153615UL, 165165UL,
    167475UL, 176715UL, 179025UL, 185955UL, 197505UL, 208845UL, 215985UL,
    225225UL, 255255UL, 250635UL, 285285UL, 277095UL, 294525UL, 315315UL,
    345345UL, 373065UL, 368445UL, 405405UL, 435435UL, 451605UL, 465465UL,
    454545UL, 504735UL, 525525UL, 555555UL, 569415UL, 596505UL, 645645UL,
    647955UL, 672945UL, 687225UL, 765765UL, 770385UL, 805035UL, 855855UL,
    858585UL, 915915UL, 945945UL, 962115UL, 1036035UL, 1066065UL, 1119195UL,
    1156155UL, 1276275UL, 1306305UL, 1354815UL, 1426425UL, 1456455UL,
    1514205UL, 1576575UL, 1666665UL, 1726725UL, 1786785UL, 1789515UL,
    1865325UL, 1996995UL, 1983135UL, 2177175UL, 2297295UL, 2327325UL,
    2417415UL, 2567565UL, 2611455UL, 2807805UL, 2847075UL, 2878785UL,
    3048045UL, 3161235UL, 3258255UL, 3357585UL, 3401475UL, 3533145UL,
    3828825UL, 3918915UL, 3985905UL, 4279275UL, 4849845UL, 4789785UL,
    4967655UL, 5180175UL, 5360355UL, 5870865UL, 5990985UL, 6561555UL,
    6531525UL, 6891885UL, 7402395UL, 7912905UL, 8273265UL, 8580495UL,
    8843835UL, 9444435UL, 10015005UL, 10465455UL, 10705695UL, 10885875UL,
    11696685UL, 12267255UL, 12507495UL, 12785955UL, 13498485UL, 14549535UL,
    14849835UL, 15570555UL, 16111095UL, 16291275UL, 17612595UL, 18123105UL,
    18633615UL, 19684665UL, 20255235UL, 20825805UL, 22207185UL, 22717695UL,
    24249225UL, 24819795UL, 25741485UL, 26531505UL, 28333305UL, 29354325UL,
    30045015UL, 31396365UL, 32807775UL, 33948915UL, 33528495UL, 34879845UL,
    37011975UL, 37522485UL, 39564525UL, 41096055UL, 43648605UL, 44219175UL,
    45930885UL, 47222175UL, 48333285UL, 50075025UL, 51816765UL, 52777725UL,
    55390335UL, 55547415UL, 59053995UL, 60063465UL, 61906845UL, 64579515UL,
    66621555UL, 67492425UL, 70105035UL, 73258185UL, 74939865UL, 77224455UL,
    79594515UL, 81876795UL, 84999915UL, 88062975UL, 91005915UL, 94189095UL,
    98423325UL, 101846745UL, 111546435UL, 111035925UL, 115120005UL,
    121246125UL, 124098975UL, 130945815UL, 140645505UL, 150345195UL,
    150225075UL, 155450295UL, 158333175UL, 170255085UL, 179444265UL,
    190285095UL, 198843645UL, 203408205UL, 206831625UL, 217222005UL,
    229474245UL, 240705465UL, 252447195UL, 254999745UL, 269023755UL,
    282146865UL, 287672385UL, 294076965UL, 306110805UL, 318302985UL,
    334639305UL, 344338995UL, 354038685UL, 363738375UL, 373438065UL,
    387221835UL, 400254855UL, 421936515UL, 431636205UL, 451035585UL,
    453888435UL, 470434965UL, 480134655UL, 510765255UL, 522506985UL,
    557732175UL, 570855285UL, 596530935UL, 610224615UL, 627912285UL,
    654729075UL, 703227525UL, 722116395UL, 751725975UL, 780825045UL,
    790524735UL, 821665845UL, 851275425UL, 863017155UL, 909984075UL,
    936020085UL, 984518535UL, 1017041025UL, 1052416365UL
#if (ULONG_MAX > 4294967295)
   ,1086110025UL, 1110614505UL, 1147371225UL, 1191785595UL, 1213887675UL,
    1265809545UL, 1282356075UL, 1331995665UL, 1391905515UL, 1450103655UL,
    1479202725UL, 1547100555UL, 1555088535UL, 1673196525UL, 1712565855UL,
    1767130365UL, 1830673845UL, 1883166285UL, 1954487535UL, 2001964965UL,
    2119382265UL, 2187280095UL, 2255177925UL, 2342475135UL, 2390973585UL,
    2421213795UL, 2555868315UL, 2672264595UL, 2788660875UL, 2856558705UL,
    2953555605UL, 3050552505UL, 3234846615UL, 3457939485UL, 3516137625UL,
    3681032355UL, 3758629875UL, 3904125225UL, 4127218095UL, 4360010655UL,
    4573403835UL, 4796496705UL, 4844995155UL, 5019589575UL, 5203883685UL,
    5262081825UL, 5465775315UL, 5766465705UL, 5898837945UL, 6164152995UL,
    6358146795UL, 6411780375UL, 6804332535UL, 6980458485UL, 7172920755UL,
    7473611145UL, 7716103395UL, 7968295335UL, 8182259085UL, 8342499165UL,
    8812168365UL, 9023519505UL, 9704539845UL, 9927632715UL, 10373818455UL,
    10439434005UL, 10820004195UL, 11043097065UL, 11489282805UL,
    11877270405UL, 12381654285UL, 12604747155UL, 13080031965UL,
    13274025765UL, 13642613985UL, 14389490115UL, 14583483915UL,
    15058768725UL, 15611651055UL, 16174233075UL, 16397325945UL,
    17289697425UL, 17735883165UL, 18143270145UL, 18381678315UL,
    19074440385UL, 19559424885UL, 20636090475UL, 20941375455UL,
    21800053275UL, 22643926305UL, 23148310185UL, 24205576395UL,
    24546777255UL, 25544133615UL, 26389538175UL, 26863291455UL,
    27813861075UL, 29113619535UL, 29494189725UL, 30520074585UL,
    30684969315UL, 31790733975UL, 33575476935UL, 34467848415UL,
    35202742575UL, 36427185795UL, 38037334335UL, 39240095895UL,
    40365259935UL, 42053005995UL, 43168470345UL, 44953213305UL,
    45845584785UL, 48522699225UL, 50307442185UL, 51869092275UL,
    53653835235UL, 54546206715UL, 56680138515UL, 58784971245UL,
    59386352025UL, 61908271425UL, 63431122755UL, 65700850215UL,
    67931778915UL, 70162707615UL, 72616729185UL, 74120181135UL,
    75740029365UL, 78417143805UL, 80871165375UL, 82840202445UL,
    86448487125UL, 88466022645UL, 91133437395UL, 92918180355UL,
    100280245065UL, 100726430805UL, 102811864155UL, 106749938295UL,
    109000266375UL, 113219631525UL, 119689324755UL, 121027881975UL,
    127943760945UL, 132628711215UL, 134859639915UL, 141775518885UL,
    148691397855UL, 150922326555UL, 155607276825UL, 161320394235UL,
    164977177365UL, 171446870595UL, 177470378085UL, 183270792705UL
#endif
};

/* All the prime factors that can appear in eulerphi(P) */
const unsigned long phiPfactors[] = {2UL, 3UL, 5UL, 7UL, 11UL, 13UL,
				     17UL, 19UL};


/* Some useful PARI functions:
   sumset(a,b) = {local(i, j, l); l = listcreate (length(a) * length(b)); for (i = 1, length(a), for (j = 1, length(b), listput(l, a[i] + b[j]))); listsort (l, 1); l}

   V(i,X) = { if (i==0, return(2)); if (i==1, return(X)); if(i%2 == 0, return (V (i/2, X)^2-2)); return (V ((i+1)/2, X) * V ((i-1)/2, X) - X)}

   U(i,X) = { if (i==0, return(0)); if (i==1, return(1)); if(i%2 == 0, return (U (i/2, X) * V(i/2,X))); return (V ((i+1)/2, X)  *U( (i-1)/2, X) + 1)}
*/

#ifndef _OPENMP
static int omp_get_num_threads () {return 1;}
static int omp_get_thread_num () {return 0;}
#endif

static void
ntt_sqr_reciprocal (mpzv_t, const mpzv_t, mpzspv_t, const spv_size_t,
		    const mpzspm_t);

static void
print_elapsed_time (int verbosity, long cpu_start,
		    ATTRIBUTE_UNUSED long real_start)
{
#ifdef _OPENMP
  if (real_start != 0L)
    {
      outputf (verbosity, " took %lums (%lums real)\n",
	       elltime (cpu_start, cputime()),
	       elltime (real_start, realtime()));
      return;
    }
#endif
  outputf (verbosity, " took %lums\n", elltime (cpu_start, cputime()));
}


static void
print_CRT_primes (const int verbosity, const char *prefix,
		   const mpzspm_t ntt_context)
{
  double modbits = 0.;
  unsigned int i;

  if (test_verbose (verbosity))
    {
      outputf (verbosity, "%s%lu", prefix, ntt_context->spm[0]->sp);
      modbits += log ((double) ntt_context->spm[0]->sp);
      for (i = 1; i < ntt_context->sp_num; i++)
	{
	  outputf (verbosity, " * %lu", ntt_context->spm[i]->sp);
	  modbits += log ((double) ntt_context->spm[i]->sp);
	}
      outputf (verbosity, ", has %d primes, %f bits\n",
               ntt_context->sp_num, modbits / log (2.));
    }
}

/* Approximate amount of memory in bytes each coefficient in an NTT takes
   so that NTT can do transforms up to length lmax with modulus, or
   with 2*modulus if twice != 0 */
static size_t
ntt_coeff_mem (const unsigned long lmax, const mpz_t modulus, const int twice)
{
  mpz_t t;
  size_t n;

  mpz_init (t);
  mpz_mul (t, modulus, modulus);
  mpz_mul_ui (t, t, lmax);
  if (twice)
    mpz_mul_2exp (t, t, 1UL);
  /* +4: +1 for rounding up, +3 for extra words due to ECRT */
  n = (mpz_sizeinbase (t, 2) - 1) / SP_NUMB_BITS + 4;
  mpz_clear (t);
  return n * sizeof (sp_t);
}

size_t
pm1fs2_memory_use (const unsigned long lmax, const mpz_t modulus,
		   const int use_ntt)
{
  if (use_ntt)
    {
      /* We store lmax / 2 + 1 coefficients for the DCT-I of F and lmax
	 coefficients for G in NTT ready format. Each coefficient in
	 NTT-ready format occupies approx.
	 ceil(log(lmax*modulus^2)/log(bits per sp_t)) + 3 words. */

      size_t n;

      n = ntt_coeff_mem (lmax, modulus, 0) * (size_t) (3 * lmax / 2 + 1);
      outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
	       "with lmax = %lu NTT is %lu bytes\n", lmax, n);
      return n;
    }
  else
    {
      /* F stores s_1/2 residues,
	 h stores s_1 mpz_t structs (residues get cloned from F)
	 g stores lmax residues,
	 R stores lmax-s_1 residues,
	 and tmp stores 3*lmax+list_mul_mem (lmax / 2) residues.
	 Assume s_1 is close to lmax/2.
	 Then we have
	 lmax/4 + lmax/2 + lmax + lmax/2 + 3*lmax + list_mul_mem (lmax / 2)
	 = (5+1/4)*lmax + list_mul_mem (lmax / 2) residues, plus s_1 mpz_t.
      */

      size_t n;

      n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
      n *= 5 * lmax + lmax / 4 + list_mul_mem (lmax / 2);
      n += lmax / 2 * sizeof (mpz_t);
      /* Memory use due to temp space allocation in TMulKS appears to
	 approximately triple the estimated memory use. This is hard to
	 estimate precisely, so let's go with the fudge factor of 3 here */
      n *= 3;
      outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
	       "with lmax = %lu is %lu bytes\n", lmax, n);
      return n;
    }
}

/* return the possible lmax for given memory use and modulus */

unsigned long
pm1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt)
{
  if (use_ntt)
    {
      size_t n, lmax = 1;

      n = ntt_coeff_mem (lmax, modulus, 0);
      lmax = 1UL << ceil_log2 (memory / n / 3);
      return lmax;
    }
  else
    {
      size_t lmax, n;

      n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);

      /* Guess an initial value of lmax for list_mul_mem (lmax / 2) */
      /* memory = n * 25/4 * lmax + lmax / 2 * sizeof (mpz_t); */
      /* Fudge factor of 3 for TMulKS as above */
      lmax = memory / (3 * 25 * n / 4 + 3 * sizeof (mpz_t) / 2);
      return lmax;
    }
}

size_t
pp1fs2_memory_use (const unsigned long lmax, const mpz_t modulus,
		   const int use_ntt, const int twopass)
{
  size_t n, m;

  m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
  if (use_ntt)
    {
      /* In one pass mode, we store h_x_ntt and h_y_ntt, each of length
	 lmax/2(+1), and g_x_ntt and g_y_ntt, each of length lmax, all in
	 NTT ready format. In two pass mode, we store h_x_ntt, h_y_ntt and
	 g_x_ntt as before, plus R which is lmax - s_1 mpz_t.
	 We assume s_1 ~= lmax/2.
      */

      n = ntt_coeff_mem (lmax, modulus, !twopass);
      if (twopass)
	return lmax * (2 * n + m / 2);
      else
	return lmax * 3 * n;
    }
  else
    {
      /* We allocate:
	 F: s_1/2 coefficients
	 fh_x, fh_y: s_1/2 coefficients
	 h_x, h_y: s_1 mpz_t's (cloned from fh_x and fh_y)
	 g_x, g_y: lmax coefficients
	 R_x, R_y: lmax - s_1 coefficients
	 tmp: 3UL * lmax + list_mul_mem (lmax / 2)
	 Assuming s_1 ~ lmax/2, that's
	 lmax/2 + 2*lmax/4 + 2*lmax + 2*lmax/2 * 3*lmax +
           list_mul_mem (lmax / 2) =
	 7 + list_mul_mem (lmax / 2) coefficients and lmax mpz_t.
       */

      n = m * (7 * lmax + list_mul_mem (lmax / 2));
      n += lmax * sizeof (mpz_t);
      n = 5 * n / 2; /* A fudge factor again */
      return n;
    }
}

unsigned long
pp1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt,
	       const int twopass)
{
  size_t n, m;

  m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
  if (use_ntt)
    {
      n = ntt_coeff_mem (1, modulus, !twopass);
      if (twopass)
	n = memory / (2 * n + m / 2);
      else
	n = memory / (3 * n);
      return 1UL << (ceil_log2 (n / 2)); /* Rounded down to power of 2 */
    }
  else
    {
      return memory / 5 / (m * 8 + sizeof (mpz_t)) * 2;
    }
}


/* Test if for given P, nr, B2min and B2 we can choose an m_1 so that the
   stage 2 interval [B2min, B2] is covered. The effective B2min and B2
   are stored in effB2min and effB2 */

static int
test_P (const mpz_t B2min, const mpz_t B2, mpz_t m_1, const unsigned long P,
	const unsigned long nr, mpz_t effB2min, mpz_t effB2)
{
  mpz_t m;
  /* We need B2min >= 2 * max(S_1 + S_2) + (2*m_1 - 1)*P + 1, or
     B2min - 2 * max(S_1 + S_2) - 1 >= (2*m_1)*P - P, or
     (B2min - 2*max(S_1 + S_2) + P - 1)/(2P) >= m_1
     Choose m_1 accordingly */

  mpz_init (m);
  sets_max (m, P);
  mpz_mul_2exp (m, m, 1UL); /* m = 2*max(S_1 + S_2) */

  mpz_sub (m_1, B2min, m);
  mpz_sub_ui (m_1, m_1, 1UL); /* m_1 = B2min - 2*max(S_1 + S_2) - 1 */
  mpz_add_ui (m_1, m_1, P);
  mpz_fdiv_q_2exp (m_1, m_1, 1UL);
  mpz_fdiv_q_ui (m_1, m_1, P);    /* 2UL*P may overflow */

  /* Compute effB2min = 2 * max(S_1 + S_2) + (2*(m_1 - 1) + 1)*P + 1 */

  mpz_mul_2exp (effB2min, m_1, 1UL);
  mpz_sub_ui (effB2min, effB2min, 1UL);
  mpz_mul_ui (effB2min, effB2min, P);
  mpz_add (effB2min, effB2min, m);
  mpz_add_ui (effB2min, effB2min, 1UL);
  ASSERT_ALWAYS (mpz_cmp (effB2min, B2min) <= 0);

  /* Compute the smallest value coprime to P at the high end of the stage 2
     interval that will not be covered:
     2*(min(S_1 + S_2)) + (2*(m_1 + nr) + 1)*P.
     We assume min(S_1 + S_2) = -max(S_1 + S_2) */
  mpz_add_ui (effB2, m_1, nr);
  mpz_mul_2exp (effB2, effB2, 1UL);
  mpz_add_ui (effB2, effB2, 1UL);
  mpz_mul_ui (effB2, effB2, P);
  mpz_sub (effB2, effB2, m);

  /* The effective B2 values is that value, minus 1 */
  mpz_sub_ui (effB2, effB2, 1UL);

  mpz_clear (m);
  return (mpz_cmp (B2, effB2) <= 0);
}


static void
factor_phiP (int *exponents, const unsigned long phiP)
{
    const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
    unsigned long cofactor = phiP;
    int i;

    ASSERT_ALWAYS (phiP > 0UL);

    for (i = 0; i < nrprimes; i++)
	for (exponents[i] = 0; cofactor % phiPfactors[i] == 0UL; exponents[i]++)
	    cofactor /= phiPfactors[i];

    ASSERT_ALWAYS (cofactor == 1UL);
}


static unsigned long
pow_ul (const unsigned long b, const unsigned int e)
{
    unsigned long r = 1UL;
    unsigned int i;

    for (i = 0; i < e; i++)
	r *= b;

    return r;
}

static unsigned long
absdiff_ul (unsigned long a, unsigned long b)
{
    return (a > b) ? a - b : b - a;
}

/* Choose s_1 so that s_1 * s_2 = phiP, s_1 is positive and even,
   s_2 >= min_s2 and s_2 is minimal and abs(s_1 - l) is minimal
   under those conditions. If use_ntt == 1, we require s_1 < l.
   Returns 0 if no such choice is possible */

static unsigned long
choose_s_1 (const unsigned long phiP, const unsigned long min_s2,
	    const unsigned long l, const int use_ntt)
{
  const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
  /* Using [nrprimes] here makes the compiler complain about variable-sized
     arrays */
  int phiPexponents[sizeof (phiPfactors) / sizeof (unsigned long)],
    exponents[sizeof (phiPfactors) / sizeof (unsigned long)];
  unsigned long s_1 = 0UL, s_2 = 0UL, trys_1;
  int i;

  ASSERT_ALWAYS (phiP > 0 && phiP % 2 == 0);

  /* We want only even s_1. We divide one 2 out of phiP here... */
  factor_phiP (phiPexponents, phiP / 2);
  for (i = 0; i < nrprimes; i++)
      exponents[i] = 0;

  do {
      trys_1 = 2; /* ... and add a 2 here */
      for (i = 0; i < nrprimes; i++)
	  trys_1 *= pow_ul (phiPfactors[i], exponents[i]);
#if 0
      printf ("choose_s_1: Trying trys_1 = %lu\n", trys_1);
#endif
      /* See if it satisfies all the required conditions and is an
	 improvement over the previous choice */
      if (phiP / trys_1 >= min_s2 &&
	  (s_2 == 0UL || phiP / trys_1 < s_2) &&
	  absdiff_ul (trys_1, l) < absdiff_ul (s_1, l) &&
	  (use_ntt == 0 || trys_1 < l))
      {
#if 0
	  printf ("choose_s_1: New best s_1 for phiP = %lu, min_s2 = %lu, "
		  "l = %lu : %lu\n", phiP, min_s2, l, trys_1);
#endif
	  s_1 = trys_1;
      }
      for (i = 0; i < nrprimes; i++)
      {
	  if (++(exponents[i]) <= phiPexponents[i])
	      break;
	  exponents[i] = 0;
      }
  } while (i < nrprimes);

  return s_1;
}


/* Approximate cost of stage 2. Cost with and without ntt are not
   comparable. We have l > s_1 and s_1 * s_2 = eulerphi(P), hence
   s_2*l > eulerphi(P) and so cost (s_2, l) > eulerphi(P) for all P */
static unsigned long
est_cost (const unsigned long s_2, const unsigned long l, const int use_ntt,
          const int method)
{
  if (method == ECM_PM1)
    {
      /* The time for building f, h and DCT-I of h seems to be about
         7/6 of the time of computing g, h*g and gcd with NTT, and
         3/2 of the time of computing g, h*g and gcd without NTT */

      if (use_ntt)
        return (7 * l) / 6 + s_2 * l;
      else
        return (3 * l) / 2 + s_2 * l;
    }
  else if (method == ECM_PP1)
    {
      /* Building f is the same, building h and its forward transform is
         twice about as expensive as for P-1. Each multi-point evaluation
         is twice as expensive as for P-1.
         FIXME: The estimate for NTT assumes the "one-pass" variant, in
         "two-pass" the multipoint evaluations are slower, so the optimum
         shifts towards smaller s_2 some more */
      if (use_ntt)
        return (4 * l) / 5 + s_2 * l;
      else
        return (3 * l) / 4 + s_2 * l;
    }
  else
    abort (); /* Invalid value for method */
}

/* Choose P so that a stage 2 range from B2min to B2 can be covered with
   multipoint evaluations, each using a convolution of length at most lmax.
   The parameters for stage 2 are stored in finalparams, the final effective
   B2min and B2 values in final_B2min and final_B2, respecively. Each of these
   may be NULL, in which case the value is not stored. It is permissible
   to let B2min and final_B2min, or B2 and final_B2 point at the same mpz_t. */

long
choose_P (const mpz_t B2min, const mpz_t B2, const unsigned long lmax,
	  const unsigned long min_s2, faststage2_param_t *finalparams,
	  mpz_t final_B2min, mpz_t final_B2, const int use_ntt,
	  const int method)
{
  /* Let S_1 + S_2 == (Z/PZ)* (mod P).

     Let F(x) = \prod_{k_1 \in S_1} (x - b_1^{2 k_1}).

     If we evaluate F(b_1^{2 k_2 + (2m + 1)P}) for all k_2 \in S_2 with
     m_1 <= m < m_1+nr, we test all exponents 2 k_2 + (2m + 1)P - 2 k_1.
     The largest value coprime to P at the low end of the stage 2 interval
     *not* covered will be
       2*max(S_2) + (2*(m_1-1) + 1)*P - 2*min(S_1).
     The smallest value at the high end not covered will be
       2*min(S_2) + (2*(m_1 + nr) + 1)*P - 2*max(S_1).
     Assume S_1 and S_2 are symmetric around 0, so that max(S_1) = -min(S_1).
     Then the largest ... is:
       2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P
     The smallest ... is:
       -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P
     The effective B2min = 2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P + 1
     The effective B2max = -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P - 1

     Then the difference effB2max - effB2min =
       -4*(max(S_1) + max(S_2)) + 2P*(nr + 1) - 2

     We obviously require B2max - B2min <= 2*nr*P
     Since nr < lmax, B2max - B2min <= 2*lmax*P or
     P >= ceil((B2max - B2min)/(2*lmax))

     Hence we are looking for an odd P with s_1 * s_2 = eulerphi(P) so that
     s_1 ~= lmax / 2 and the whole stage 2 interval is covered. s_2 should
     be small, as long as s_1 is small enough.
  */

  mpz_t B2l, m_1, effB2min, tryeffB2, effB2, lmin;
  /* The best parameters found so far, P == 0 means that no suitable P
     has been found yet: */
  unsigned long P = 0, s_1 = 0, s_2 = 0, l = 0, cost = 0;
  unsigned int i;
  const unsigned int Pvalues_len = sizeof (Pvalues) / sizeof (unsigned long);
  int r;

  outputf (OUTPUT_TRACE,
           "choose_P(B2min = %Zd, B2 = %Zd, lmax = %lu, min_s2 = %ld, "
           "use_ntt = %d, method = %d\n",
           B2min, B2, lmax, min_s2, use_ntt, method);

  if (mpz_cmp (B2, B2min) < 0)
    return 0L;

  /* If we use the NTT, we allow only power-of-two transform lengths.
     In that case, the code below assumes that lmax is a power of two.
     If that is not the case, print error and return. */
  if (use_ntt && (lmax & (lmax - 1UL)) != 0)
    {
      outputf (OUTPUT_ERROR,
               "choose_P: Error, lmax = %lu is not a power of two\n", lmax);
      return ECM_ERROR;
    }

  mpz_init (effB2);
  mpz_init (tryeffB2);
  mpz_init (effB2min);
  mpz_init (B2l);
  mpz_init (m_1);
  mpz_init (lmin);

  mpz_sub (B2l, B2, B2min);
  mpz_add_ui (B2l, B2l, 1UL); /* +1 due to closed interval */

  /* For each candidate P, check if [B2min, B2] can be covered at all,
     and if so, what the best parameters (minimizing the cost, maximizing
     effB2) are. If they are better than the best parameters for the best P
     so far, remember them. */

  for (i = 0 ; i < Pvalues_len; i++)
    {
      unsigned long tryP, tryphiP, trys_1, trys_2, tryl, trycost;

      tryP = Pvalues[i];
      tryphiP = eulerphi (tryP);

      outputf (OUTPUT_TRACE,
	       "choose_P: trying P = %lu, eulerphi(P) = %lu\n", tryP, tryphiP);

      /* If we have a good P already and this tryphiP >= cost, then
	 there's no hope for this tryP, since cost(s_2, l) > eulerphi(P) */
      if (P != 0 && tryphiP >= cost)
	{
	  outputf (OUTPUT_TRACE,
		   "choose_P: tryphiP > cost = %lu, this P is too large\n",
		   cost);
	  continue;
	}

      /* We have nr < l and effB2-effB2min <= 2*nr*P. Hence we need
	 l >= B2l/P/2 */
      mpz_cdiv_q_ui (lmin, B2l, tryP);
      mpz_cdiv_q_2exp (lmin, lmin, 1UL);
      outputf (OUTPUT_TRACE, "choose_P: lmin = %Zd for P = %lu\n", lmin, tryP);
      if (mpz_cmp_ui (lmin, lmax) > 0)
	{
	  outputf (OUTPUT_TRACE,
		   "choose_P: lmin > lmax, this P is too small\n");
	  continue;
	}

      /* Try all possible transform lengths and store parameters in
	 P, s_1, s_2, l if they are better than the previously best ones */

      /* Keep reducing tryl to find best parameters. For NTT, we only have
	 power of 2 lengths so far, so we can simply divide by 2.
	 For non-NTT, we have arbitrary transform lengths so we can decrease
	 in smaller steps... let's say by, umm, 25% each time? */
      for (tryl = lmax; mpz_cmp_ui (lmin, tryl) <= 0;
	   tryl = (use_ntt) ? tryl / 2 : 3 * tryl / 4)
	{
	  trys_1 = choose_s_1 (tryphiP, min_s2, tryl / 2, use_ntt);
	  if (trys_1 == 0)
	    {
	      outputf (OUTPUT_TRACE,
		       "choose_P: could not choose s_1 for P = %lu, l = %lu\n",
		       tryP, tryl);
	      continue;
	    }
	  ASSERT (tryphiP % trys_1 == 0UL);
	  trys_2 = tryphiP / trys_1;
	  outputf (OUTPUT_TRACE, "choose_P: chose s_1 = %lu, k = s_2 = %lu "
		   "for P = %lu, l = %lu\n", trys_1, trys_2, tryP, tryl);

	  if (test_P (B2min, B2, m_1, tryP, tryl - trys_1, effB2min, tryeffB2))
	    {
	      outputf (OUTPUT_TRACE,
		       "choose_P: P = %lu, l = %lu, s_1 = %lu, k = s_2 = %lu "
		       "works, m_1 = %Zd, effB2min = %Zd, effB2 = %zZd\n",
		       tryP, tryl, trys_1, trys_2, m_1, effB2min, tryeffB2);

          /* do not allow B2min to be less than 0 */
          if (mpz_cmp_ui(effB2min, 0) < 0)
            continue;

	      /* We use these parameters if we
		 1. didn't have any suitable ones yet, or
		 2. these cover [B2min, B2] and are cheaper than the best
                    ones so far, or
		 3. they are as expensive but reach greater effB2. */
	      trycost = est_cost (trys_2, tryl, use_ntt, method);
	      ASSERT (tryphiP < trycost);
	      if (P == 0 || trycost < cost ||
		  (trycost == cost && mpz_cmp (tryeffB2, effB2) > 0))
		{
		  outputf (OUTPUT_TRACE,
			   "choose_P: and is the new optimum (cost = %lu)\n",
			   trycost);
		  P = tryP;
		  s_1 = trys_1;
		  s_2 = trys_2;
		  l = tryl;
		  cost = trycost;
		  mpz_set (effB2, tryeffB2);
		}
	    }
	}
  }

  if (P != 0) /* If we found a suitable P */
    {
      /* Compute m_1, effB2min, effB2 again */
      r = test_P (B2min, B2, m_1, P, l - s_1, effB2min, effB2);
      ASSERT_ALWAYS(r != 0);
      if (finalparams != NULL)
	{
	  finalparams->P = P;
	  finalparams->s_1 = s_1;
	  finalparams->s_2 = s_2;
	  finalparams->l = l;
	  mpz_set (finalparams->m_1, m_1);
	}
      if (final_B2min != NULL)
	mpz_set (final_B2min, effB2min);
      if (final_B2 != NULL)
	mpz_set (final_B2, effB2);
    }

  mpz_clear (effB2);
  mpz_clear (tryeffB2);
  mpz_clear (effB2min);
  mpz_clear (B2l);
  mpz_clear (m_1);
  mpz_clear (lmin);

  return (P != 0) ? (long) P : ECM_ERROR;
}


static void
list_output_poly (listz_t l, unsigned long len, int monic, int symmetric,
		  char *prefix, char *suffix, int verbosity)
{
  unsigned long i;

  if (prefix != NULL)
    outputf (verbosity, prefix);

  if (len == 0)
    {
      if (monic)
	outputf (verbosity, "1\n", len, len);
      else
	outputf (verbosity, "0\n", len);
      return;
    }

  if (monic)
    {
      if (symmetric)
	outputf (verbosity, "(x^%lu + x^-%lu) + ", len, len);
      else
	outputf (verbosity, "x^%lu + ", len);
    }
  for (i = len - 1; i > 0; i--)
    if (symmetric)
      outputf (verbosity, "%Zd * (x^%lu + x^-%lu) + ", l[i], i, i);
    else
      outputf (verbosity, "%Zd * x^%lu + ", l[i], i);
  outputf (verbosity, "%Zd", l[0]);
  if (suffix != NULL)
    outputf (verbosity, suffix);
}

#if 0
/* Multiply P[i] by r^{k(deg-i)}, for 0 <= i <= deg. Needs 3 entries in tmp. */
/* I.e., let P(x) = x^deg + \sum_{i=0}^{deg - 1} P[i] * x^i. The output is
   R(x) = x^deg + \sum_{i=0}^{deg - 1} R[i] * x^i = r^(k deg) P(r^{-k} x). */
/* The input and output polynomials are monic and have the leading monomial
   implicit, i.e. not actually stored in the array of coefficients. */
/* Returns 0 if a modular inversion failed (in which case R is left
   unchanged), 1 otherwise */

static int
list_scale_rev (listz_t R, listz_t S, mpz_t r, long k, unsigned long deg,
		mpz_t modulus, listz_t tmp,
		ATTRIBUTE_UNUSED const unsigned long tmplen)
{
  unsigned long i;

  ASSERT (tmplen >= 3);
  mpz_powm_ui (tmp[0], r, (unsigned long) labs (k), modulus);
  if (k < 0)
    {
      if (!mpz_invert (tmp[0], tmp[0], modulus)) /* FIXME: get rid of this! */
	return 0;
    }
  /* Here, tmp[0] = r^k */
  mpz_set (tmp[1], tmp[0]);
  /* mpz_set (R[deg], S[deg]); Leading monomial is not stored! */
  for (i = 1; i + 1 <= deg; i++)
    {
      /* Here, tmp[1] = r^(ki) */
      mpz_mul (tmp[2], S[deg-i], tmp[1]);
      mpz_mod (R[deg-i], tmp[2], modulus);
      mpz_mul (tmp[2], tmp[1], tmp[0]);  /* FIXME, avoid unnecessary mul */
      mpz_mod (tmp[1], tmp[2], modulus); /* at end of loop */
    }
  if (i <= deg)
    {
      mpz_mul (tmp[2], S[deg-i], tmp[1]);
      mpz_mod (R[deg-i], tmp[2], modulus);
    }

  return 1;
}
#endif

/* Same, but does squaring which makes things easier */

static void
list_sqr_reciprocal (listz_t R, listz_t S, const unsigned long l,
		     mpz_t modulus, listz_t tmp,
		     ATTRIBUTE_UNUSED const unsigned long tmplen)
{
  unsigned long i;
  listz_t Srev, r1 = tmp, r2 = tmp + 2 * l - 1, t = tmp + 4 * l - 2;

  if (l == 0UL)
    return;

  /* FIXME: This modifies the input arguments. */
  /* We have to divide S[0] by 2 */

  ASSERT (tmplen >= 4 * l - 2 + list_mul_mem (l));

#if 0
  gmp_printf ("/* list_sqr_reciprocal */ S(x) = %Zd", S[0]);
  for (i = 1; i < l1; i++)
    gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S[i], i, i);
  gmp_printf ("\n");
#endif

  if (mpz_odd_p (S[0]))
    {
      ASSERT_ALWAYS (mpz_odd_p (modulus));
      mpz_add (S[0], S[0], modulus);
    }
  mpz_tdiv_q_2exp (S[0], S[0], 1UL);

  list_mul (r1, S, l, S, l, 0, t);
  /* r1 = f0*g0/4 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
#if 0
  for (i = 0; i < 2UL * l - 1UL; i++)
    gmp_printf ("list_sqr_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
#endif

  Srev = (listz_t) malloc (l * sizeof (mpz_t));
  ASSERT_ALWAYS (Srev != NULL);
  for (i = 0UL; i < l; i++)
      (*Srev)[i] = (*S)[l - 1UL - i];
  list_mul (r2, S, l, Srev, l, 0, t);
  /* r2 is symmetric, r2[i] = r2[2*l - 2 - i]. Check this */
#if 0
  for (i = 0; 0 && i < 2UL * l - 1UL; i++)
    gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
#endif
#ifdef WANT_ASSERT
  for (i = 0UL; i < l; i++)
    ASSERT (mpz_cmp (r2[i], r2[2UL * l - 2UL - i]) == 0);
#endif
  free (Srev);
  /* r2 = g1*f0/2 + (g0*f0/4 + g1*f1) * x + g0*f1/2 * x^2 */
#if 0
  for (i = 0; i < 2UL * l - 1UL; i++)
    gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
#endif

  mpz_mul_2exp (r1[0], r1[0], 1UL);
  /* r1 = f0*g0/2 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
  for (i = 0UL; i < l; i++)
    {
      mpz_mul_2exp (r2[l - i - 1UL], r2[l - i - 1UL], 1UL);
      mpz_add (R[i], r1[i], r2[l - i - 1UL]);
    }
  /* r1 = 3/4*f0*g0 + g1*f1 + (f0*g1 + 2*f1*g0)/2 * x + f1*g1 * x^2 */
  /* r1 = f0*g0 + 2*g1*f1 + (f0*g1 + f1*g0) * x + f1*g1 * x^2 */
  for (i = l; i < 2UL * l - 1UL; i++)
      mpz_set (R[i], r1[i]);

  if (R != S)
    mpz_mul_2exp (S[0], S[0], 1UL);

#if 0
  for (i = 0; i < 2UL * l; i++)
    gmp_printf ("list_sqr_reciprocal: R[%lu] = %Zd\n", i, R[i]);
#endif
}

#ifdef WANT_ASSERT
static void
list_recip_eval1 (mpz_t R, const listz_t S, const unsigned long l)
{
  unsigned long i;

  mpz_set_ui (R, 0UL);
  for (i = 1; i < l; i++)
    mpz_add (R, R, S[i]);
  mpz_mul_2exp (R, R, 1UL);
  if (l > 0UL)
    mpz_add (R, R, S[0]);
}
#endif

/* Multiply two reciprocal polynomials of degree 2*l1-2 and 2*l2-2, resp.,
   with coefficients in standard basis

   S_1(x) = S1[0] + sum_{1 \leq i \leq l1 - 1} S1[i] (x^i + x^{-i})
   S_2(x) = S2[0] + sum_{1 \leq i \leq l2 - 1} S2[i] (x^i + x^{-i})

   to the reciprocal polynomial of degree 2*(l1 + l2) - 4

   R(x) = R[0] + sum_{1 \leq i \leq l1 + l2 - 2} R[i] (x^i + x^{-i})
        = S_1(x) * S_2(x)

   R == S1 == S2 is permissible, however if S1 == S2, l1 must be equal
   to l2 (i.e. the multiplication must be a squaring)
*/
  /* FIXME: This modifies the input arguments. */
  /* We have to divide S1[0] and S2[0] by 2 */

static void
list_mul_reciprocal (listz_t R, listz_t S1, unsigned long l1,
		     listz_t S2, unsigned long l2,
		     mpz_t modulus, listz_t tmp,
		     ATTRIBUTE_UNUSED const unsigned long tmplen)
{
  unsigned long i;
  const unsigned long lmax = MAX(l1, l2);
  listz_t r1 = tmp, r2 = tmp + 2*lmax - 1, rev = tmp + 4*lmax - 2,
    t = tmp + 6*lmax - 3;
#ifdef WANT_ASSERT
  mpz_t sum1, sum2, prod;
#endif

  ASSERT (S1 < tmp || S1 >= tmp + tmplen);
  ASSERT (S2 < tmp || S2 >= tmp + tmplen);
  ASSERT (R < tmp || R >= tmp + tmplen);

  if (l1 == 0UL || l2 == 0UL)
    return;

  if (S1 == S2)
    {
      ASSERT_ALWAYS (l1 == l2);
      list_sqr_reciprocal (R, S1, l1, modulus, tmp, tmplen);
      return;
    }

  ASSERT (tmplen >= 6*lmax - 3 + list_mul_mem (lmax));
#ifdef WANT_ASSERT
  mpz_init (sum1);
  mpz_init (sum2);
  mpz_init (prod);
  list_recip_eval1 (sum1, S1, l1);
  list_recip_eval1 (sum2, S2, l2);
  mpz_mul (prod, sum1, sum2);
  mpz_mod (prod, prod, modulus);
#endif


  /* Make S1 the longer of the two, i.e. l1 >= l2 */
  if (l2 > l1)
    {
      listz_t St = S1;
      unsigned long lt = l1;
      S1 = S2;
      S2 = St;
      l1 = l2;
      l2 = lt;
    }

#if 0
  gmp_printf ("/* list_mul_reciprocal */ S1(x) = %Zd", S1[0]);
  for (i = 1; i < l1; i++)
    gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S1[i], i, i);
  gmp_printf ("\n");
  gmp_printf ("/* list_mul_reciprocal */ S2(x) = %Zd", S2[0]);
  for (i = 1; i < l1; i++)
    gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S2[i], i, i);
  gmp_printf ("\n");
#endif

  /* Divide S1[0] and S2[0] by 2 */
  if (mpz_odd_p (S1[0]))
    {
      ASSERT_ALWAYS (mpz_odd_p (modulus));
      mpz_add (S1[0], S1[0], modulus);
    }
  mpz_tdiv_q_2exp (S1[0], S1[0], 1UL);

  if (mpz_odd_p (S2[0]))
    {
      ASSERT_ALWAYS (mpz_odd_p (modulus));
      mpz_add (S2[0], S2[0], modulus);
    }
  mpz_tdiv_q_2exp (S2[0], S2[0], 1UL);

  /* Pad rev with zeros */
  for (i = l2; i < lmax; i++)
    mpz_set_ui (rev[i], 0UL);

  for (i = 0UL; i < l2; i++)
    mpz_set (rev[i], S2[l2 - 1UL - i]);
  list_mul (r1, S1, lmax, rev, lmax, 0, t);
  /* r1 = \tilde{f}(x) \rev(\tilde{g}(x)) and has degree l1 + l2 - 2,
     i.e. l1 + l2 - 1 entries. */
#if 0
  for (i = 0; i < 2 * lmax - 1; i++)
    gmp_printf ("list_mul_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
#endif

  for (i = 0UL; i < l2; i++)
    mpz_set(rev[i], S2[i]);
  list_mul (r2, S1, lmax, rev, lmax, 0, t);
  /* \tilde{f}(x) \tilde{g}(x) */

#if 0
  for (i = 0; i < 2 * lmax - 1; i++)
    gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
#endif

  /* Add f_0*g_0 by doubling the f_0*g_0 term in r2 */
  mpz_mul_2exp (r2[0], r2[0], 1UL);

  /* Add \flloor x^{-d_g} \tilde{f}(x) \rev(\tilde{g}(x)) \rfloor.
     d_g = l2 - 1. */
  for (i = 0; i < l1; i++)
    mpz_add (r2[i], r2[i], r1[i + l2 - 1]);

  /* Add \floor x^{-d_f} rev(\tilde{f}(x) \rev(\tilde{g}(x))) \rfloor.
     d_f = l1 - 1. rev(r2)[i] = r2[l1 + l2 - 2 - i]. We want
     rev(r2)[l1 - 1 ... l1 + l2 - 2], hence
     r2[l2 - 1 ... 0] */
  for (i = 0; i < l2; i++)
    mpz_add (r2[i], r2[i], r1[l2 - 1 - i]);

#if 0
  for (i = 0; i < l1 + l2 - 1; i++)
    gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
#endif

  mpz_mul_2exp (S1[0], S1[0], 1UL);
  mpz_mul_2exp (S2[0], S2[0], 1UL);

  for (i = 0; i < l1 + l2 - 1; i++)
    mpz_set (R[i], r2[i]);

#if 0
  for (i = 0; i < l1 + l2 - 1; i++)
    gmp_printf ("list_mul_reciprocal: R[%lu] = %Zd\n", i, R[i]);
#endif
#ifdef WANT_ASSERT
  list_recip_eval1 (sum1, R, l1 + l2 - 1);
  mpz_mod (sum1, sum1, modulus);
  ASSERT (mpz_cmp (prod, sum1) == 0);
  mpz_clear (sum1);
  mpz_clear (sum2);
  mpz_clear (prod);
#endif
}

/*
  Computes V_k(S), where the Chebyshev polynomial V_k(X) is defined by
  V_k(X + 1/X) = X^k + 1/X^k
*/

static void
V (mpres_t R, const mpres_t S, const long k, mpmod_t modulus)
{
  mpres_t V0, Vi, Vi1;
  unsigned long j, uk;
  int po2;

  if (k == 0L)
    {
      mpres_set_ui (R, 2UL, modulus);
      return;
    }

  uk = labs (k);

  if (uk == 1UL)
    {
      mpres_set (R, S, modulus);
      return;
    }

  for (po2 = 0; uk % 2UL == 0UL; uk >>= 1, po2++);

  mpres_init (V0, modulus);
  mpres_set_ui (V0, 2UL, modulus); /* V0 = V_0(S) = 2 */

  if (uk == 1UL)
    {
      mpres_set (R, S, modulus);
      while (po2-- > 0)
        {
          mpres_sqr (R, R, modulus);
          mpres_sub (R, R, V0, modulus);
        }
      mpres_clear (V0, modulus);
      return;
    }

  if (0)
    {
      mpz_t tz;
      mpz_init (tz);
      mpres_get_z (tz, S, modulus);
      gmp_printf ("Chebyshev_V(%ld, Mod(%Zd,N)) == ", k, tz);
      mpz_clear (tz);
    }

  for (j = 1UL; j <= uk / 2UL; j <<= 1);

  mpres_init (Vi, modulus);
  mpres_init (Vi1, modulus);

  /* i = 1. Vi = V_i(S), Vi1 = V_{i+1}(S) */
  mpres_set (Vi, S, modulus);
  mpres_sqr (Vi1, S, modulus);
  mpres_sub (Vi1, Vi1, V0, modulus);
  j >>= 1;

  while (j > 1)
    {
      if ((uk & j) != 0UL)
	{
	  /* i' = 2i + 1.
	     V_{i'} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1
	     V_{i'+1} = V_{2i + 2} = {V_{i+1}}^2 - V_0. */
	  mpres_mul (Vi, Vi, Vi1, modulus);
	  mpres_sub (Vi, Vi, S, modulus);
	  mpres_sqr (Vi1, Vi1, modulus);
	  mpres_sub (Vi1, Vi1, V0, modulus);
	}
      else
	{
	  /* i' = 2i.
	     V_{i'} = V_{2i} = {V_i}^2 - V0.
	     V_{i'+1} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1 */
	  mpres_mul (Vi1, Vi, Vi1, modulus);
	  mpres_sub (Vi1, Vi1, S, modulus);

	  mpres_sqr (Vi, Vi, modulus);
	  mpres_sub (Vi, Vi, V0, modulus);
	}
      j >>= 1;
    }

  /* Least significant bit of uk is always 1 */
  mpres_mul (Vi, Vi, Vi1, modulus);
  mpres_sub (Vi, Vi, S, modulus);

  while (po2-- > 0)
    {
      mpres_sqr (Vi, Vi, modulus);
      mpres_sub (Vi, Vi, V0, modulus);
    }

  mpres_set (R, Vi, modulus);

  mpres_clear (Vi, modulus);
  mpres_clear (Vi1, modulus);
  mpres_clear (V0, modulus);

  if (0)
    {
      mpz_t tz;
      mpz_init (tz);
      mpres_get_z (tz, R, modulus);
      gmp_printf ("%Zd\n", tz);
      mpz_clear (tz);
    }
}

/*
  Computes U_k(S), where the Chebyshev polynomial U_k(X) is defined by
  U_k(X + 1/X) = (X^k - 1/X^k) / (X - 1/X)
  If R1 != NULL, stores U_{k+1}(S) there
*/

static void
U (mpres_t R, mpres_t R1, const mpres_t S, const long k, mpmod_t modulus)
{
  mpres_t V0, Vi, Vi1, Ui, Ui1, t;
  unsigned long j, uk;

  if (k == 0L)
    {
      mpres_set_ui (R, 0UL, modulus); /* U_0 = 0 */
      if (R1 != NULL)
	mpres_set_ui (R1, 1UL, modulus); /* U_1 = 1 */
      return;
    }

  uk = labs (k);

  if (uk == 1UL)
    {
      mpres_set_ui (R, 1UL, modulus);
      if (k == -1)
	mpres_neg (R, R, modulus);

      if (R1 != NULL)
	{
	  if (k == -1)
	    mpres_set_ui (R1, 0UL, modulus);
	  else
	    mpres_set (R1, S, modulus); /* U_2(S) = S */
	}

      return;
    }

  if (0)
    {
      mpz_t tz;
      mpz_init (tz);
      mpres_get_z (tz, S, modulus);
      gmp_printf ("Chebyshev_U(%ld, Mod(%Zd,N)) == ", k, tz);
      mpz_clear (tz);
    }

  mpres_init (V0, modulus);
  mpres_init (Vi, modulus);
  mpres_init (Vi1, modulus);
  mpres_init (Ui, modulus);
  mpres_init (Ui1, modulus);
  mpres_init (t, modulus);

  for (j = 1UL; j <= uk / 2UL; j <<= 1);

  mpres_set_ui (Ui, 1UL, modulus);   /* Ui = U_1(S) = 1 */
  mpres_set (Ui1, S, modulus);       /* Ui1 = U_2(S) = S */
  mpres_add (V0, Ui, Ui, modulus);   /* V0 = V_0(S) = 2 */
  mpres_set (Vi, S, modulus);        /* Vi = V_1(S) = S */
  mpres_sqr (Vi1, Vi, modulus);
  mpres_sub (Vi1, Vi1, V0, modulus); /* Vi1 = V_2(S) = S^2 - 2 */
  j >>= 1; /* i = 1 */

  while (j != 0)
    {
      if ((uk & j) == 0UL)
	{
	  mpres_mul (Vi1, Vi1, Vi, modulus);
	  mpres_sub (Vi1, Vi1, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
	  /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
	  mpres_sub (t, Ui1, Ui, modulus);
	  mpres_add (Ui1, Ui1, Ui, modulus);
	  mpres_mul (Ui1, Ui1, t, modulus);
	  mpres_mul (Ui, Ui, Vi, modulus); /* U_{2n} = U_n V_n */
	  mpres_sqr (Vi, Vi, modulus);
	  mpres_sub (Vi, Vi, V0, modulus); /* V_{2n} = V_n^2 - 2 */
	}
      else
	{
	  /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
	  mpres_sub (t, Ui1, Ui, modulus);
	  mpres_add (Ui, Ui, Ui1, modulus);
	  mpres_mul (Ui, Ui, t, modulus);
	  mpres_mul (Ui1, Ui1, Vi1, modulus); /* U_{2n+2} = U_{n+1} V_{n+1} */
	  mpres_mul (Vi, Vi, Vi1, modulus);
	  mpres_sub (Vi, Vi, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
	  mpres_sqr (Vi1, Vi1, modulus);
	  mpres_sub (Vi1, Vi1, V0, modulus); /* V_{2n+2} = V_{n+1}^2 - 2 */
	}
      j >>= 1;
    }

  if (k > 0)
    mpres_set (R, Ui, modulus);
  else
    mpres_neg (R, Ui, modulus);

  if (R1 != NULL)
    {
      /* Here k != -1,0,1, so k+1 is negative iff k is */
      if (k > 0)
	mpres_set (R1, Ui1, modulus);
      else
	mpres_neg (R1, Ui1, modulus);
    }

  mpres_clear (V0, modulus);
  mpres_clear (Vi, modulus);
  mpres_clear (Vi1, modulus);
  mpres_clear (Ui, modulus);
  mpres_clear (Ui1, modulus);
  mpres_clear (t, modulus);

  if (0)
    {
      mpz_t tz;
      mpz_init (tz);
      mpres_get_z (tz, R, modulus);
      gmp_printf ("%Zd\n", tz);
      mpz_clear (tz);
    }
}


/* Set R[i] = V_{i+k}(Q) * F[i] or U_{i+k}(Q) * F[i], for 0 <= i < len
   We compute V_{i+k+1}(Q) by V_{i+k}(Q)*V_1(Q) - V_{i+k-1}(Q).
   For U, we compute U_{i+k+1}(Q) by U_{i+k}(Q)*V_1(Q) - U_{i+k-1}(Q).
   The values of V_1(Q), V_{k-1}(Q) and V_k(Q) and V_k(Q) are in
   V1, Vk_1 and Vk, resp.
   The values of Vk_1 and Vk are clobbered. */
static void
scale_by_chebyshev (listz_t R, const listz_t F, const unsigned long len,
                    mpmod_t modulus, const mpres_t V1, mpres_t Vk_1,
                    mpres_t Vk)
{
  mpres_t Vt;
  unsigned long i;

  mpres_init (Vt, modulus);

  for (i = 0; i < len; i++)
    {
      mpres_mul_z_to_z (R[i], Vk, F[i], modulus);
      mpres_mul (Vt, Vk, V1, modulus);
      mpres_sub (Vt, Vt, Vk_1, modulus);
      mpres_set (Vk_1, Vk, modulus); /* Could be a swap */
      mpres_set (Vk, Vt, modulus); /* Could be a swap */
    }

  mpres_clear (Vt, modulus);
}


/* For a given reciprocal polynomial
   F(x) = f_0 + sum_{i=1}^{deg} f_i V_i(x+1/x),
   compute F(\gamma x)F(\gamma^{-1} x), with Q = \gamma + 1 / \gamma

   If NTT is used, needs 4 * deg + 3 entries in tmp.
   If no NTT is used, needs 4 * deg + 2 + (memory use of list_sqr_reciprocal)
*/

static void
list_scale_V (listz_t R, const listz_t F, const mpres_t Q,
              const unsigned long deg, mpmod_t modulus, listz_t tmp,
              const unsigned long tmplen,
	      mpzspv_t dct, const mpzspm_t ntt_context)
{
  mpres_t Vt;
  unsigned long i;
  const listz_t G = tmp, H = tmp + 2 * deg + 1, newtmp = tmp + 4 * deg + 2;
  const unsigned long newtmplen = tmplen - 4 * deg - 2;
#ifdef WANT_ASSERT
  mpz_t leading;
#endif

  if (deg == 0)
    {
      ASSERT(tmplen >= 1);
      mpz_mul (tmp[0], F[0], F[0]);
      mpz_mod (R[0], tmp[0], modulus->orig_modulus);
      return;
    }

  /* Make sure newtmplen does not underflow */
  ASSERT_ALWAYS (tmplen >= 4 * deg + 2);
#ifdef WANT_ASSERT
  mpz_init (leading);
  mpz_mul (leading, F[deg], F[deg]);
  mpz_mod (leading, leading, modulus->orig_modulus);
#endif

  /* Generate V_1(Q)/2 ... V_{deg}(Q)/2, multiply by f_i to form coefficients
     of G(x). Square the symmetric G(x) polynomial. */

  outputf (OUTPUT_TRACE, "list_scale_V: Q=%Zd, deg = %lu\n", Q, deg);
  list_output_poly (F, deg + 1, 0, 1, "/* list_scale_V */ F(x) = ", "\n",
		    OUTPUT_TRACE);

  /* Compute G[i] = V_i(Q)/2 * F[i] for i = 0, ..., deg.
     For i=0, V_0(Q) = 2, so G[0] = F[0],
     which leaves deg entries to process */

  mpz_set (G[0], F[0]);

#if defined(_OPENMP)
#pragma omp parallel if (deg > 1000)
#endif
  {
    const int nr_chunks = omp_get_num_threads();
    const int thread_nr = omp_get_thread_num();
    mpmod_t modulus_local;
    unsigned long l, start_i;
    mpres_t Vi, Vi_1;

    l = (deg - 1) / nr_chunks + 1; /* l = ceil (deg / nr_chunks) */
    start_i = thread_nr * l + 1;
    if (start_i <= deg + 1)
      l = MIN(l, deg + 1 - start_i);
    else
      l = 0;

    mpmod_init_set (modulus_local, modulus);
    mpres_init (Vi_1, modulus_local);
    mpres_init (Vi, modulus_local);

    V (Vi, Q, start_i, modulus_local);
    mpres_div_2exp (Vi, Vi, 1, modulus_local);
    V (Vi_1, Q, start_i - 1UL, modulus_local);
    mpres_div_2exp (Vi_1, Vi_1, 1, modulus_local);
    scale_by_chebyshev (G + start_i, F + start_i, l, modulus_local,
                        Q, Vi_1, Vi);

    mpres_clear (Vi_1, modulus_local);
    mpres_clear (Vi, modulus_local);
    mpmod_clear (modulus_local);
  }


  list_output_poly (G, deg + 1, 0, 1, "/* list_scale_V */ G(x) = ", "\n",
		    OUTPUT_TRACE);

  /* Now square the G polynomial in G[0 .. deg], put result in
     G[0 .. 2*deg] */

  /* Bugfix: ks_multiply() does not like negative coefficients. FIXME */

  for (i = 0; i <= deg; i++)
    if (mpz_sgn (G[i]) < 0)
      {
	mpz_add (G[i], G[i], modulus->orig_modulus);
	/* FIXME: make sure the absolute size does not "run away" */
	if (mpz_sgn (G[i]) < 0)
	  {
	    outputf (OUTPUT_ERROR, "list_scale_V: G[%lu] still negative\n", i);
	    mpz_mod (G[i], G[i], modulus->orig_modulus);
	  }
      }

  if (dct != NULL && ntt_context != NULL)
    ntt_sqr_reciprocal (G, G, dct, deg + 1, ntt_context);
  else
    list_sqr_reciprocal (G, G, deg + 1, modulus->orig_modulus,
                         newtmp, newtmplen);

  list_output_poly (G, 2 * deg + 1, 0, 1, "/* list_scale_V */ G(x)^2 == ",
		    "\n", OUTPUT_TRACE);

  /* Compute H[i-1] = U_i(Q)/2 * F[i] for i = 1, ..., deg */

#if defined(_OPENMP)
#pragma omp parallel if (deg > 1000)
#endif
  {
    const int nr_chunks = omp_get_num_threads();
    const int thread_nr = omp_get_thread_num();
    mpmod_t modulus_local;
    unsigned long l, start_i;
    mpres_t Ui, Ui_1;

    l = (deg - 1) / nr_chunks + 1; /* l = ceil(deg / nr_chunks) */
    start_i = thread_nr * l + 1UL;
    if (start_i <= deg + 1)
      l = MIN(l, deg + 1 - start_i);
    else
      l = 0;

    mpmod_init_set (modulus_local, modulus);
    mpres_init (Ui_1, modulus_local);
    mpres_init (Ui, modulus_local);

    U (Ui_1, Ui, Q, start_i - 1, modulus_local);
    mpres_div_2exp (Ui, Ui, 1, modulus_local);
    mpres_div_2exp (Ui_1, Ui_1, 1, modulus_local);

    scale_by_chebyshev (H - 1 + start_i, F + start_i, l, modulus_local,
                        Q, Ui_1, Ui);

    mpres_clear (Ui_1, modulus_local);
    mpres_clear (Ui, modulus_local);
    mpmod_clear (modulus_local);
  }


  /* Convert H to standard basis */
  /* We can do it in-place with H - 1 = H_U. */

  for (i = deg; i >= 3; i--)
    {
      mpz_add (H[i - 3], H[i - 3], H[i - 1]);
      if (mpz_cmp (H[i - 3], modulus->orig_modulus) >= 0)
        mpz_sub (H[i - 3], H[i - 3], modulus->orig_modulus);
    }

  /* U_2(X+1/X) = (X^2 - 1/X^2)/(X-1/X) = X+1/X = V_1(X+1/X),
     so no addition occures here */
  /* if (deg >= 2)
     mpz_set (H[1], H[1]); Again, a no-op. */

  /* U_1(X+1/X) = 1, so this goes to coefficient of index 0 in std. basis */
  /* mpz_set (H[0], H[0]); Another no-op. */

  /* Now H[0 ... deg-1] contains the deg coefficients in standard basis
     of symmetric H(X) of degree 2*deg-2. */

  list_output_poly (H, deg, 0, 1, "/* list_scale_V */ H(x) = ", "\n",
		    OUTPUT_TRACE);

  /* Square the symmetric H polynomial of degree 2*deg-2 (i.e. with deg
     coefficents in standard basis in H[0 ... deg-1]) */

  /* Bugfix: ks_multiply() does not like negative coefficients. */

  for (i = 0; i <= deg; i++)
    if (mpz_sgn (H[i]) < 0)
      {
	mpz_add (H[i], H[i], modulus->orig_modulus);
	if (mpz_sgn (H[i]) < 0)
	  {
	    outputf (OUTPUT_ERROR, "list_scale_V: H[%lu] still negative\n", i);
	    mpz_mod (H[i], H[i], modulus->orig_modulus);
	  }
      }

  if (dct != NULL && ntt_context != NULL)
    ntt_sqr_reciprocal (H, H, dct, deg, ntt_context);
  else
    list_sqr_reciprocal (H, H, deg, modulus->orig_modulus,
  		         newtmp, newtmplen);

  /* Now there are the 2*deg-1 coefficients in standard basis of a
     symmetric polynomial of degree 4*deg - 4 in H[0 ... 2*deg-2] */

  list_output_poly (H, 2*deg - 1, 0, 1, "/* list_scale_V */ H(x)^2 == ", "\n",
		    OUTPUT_TRACE);

  /* Multiply by Q^2-4 */
  mpres_init (Vt, modulus);
  mpres_sqr (Vt, Q, modulus);
  mpres_sub_ui (Vt, Vt, 4, modulus);

#if defined(_OPENMP)
#pragma omp parallel if (deg > 1000)
  {
    mpmod_t modulus_local;
    long i; /* OpenMP insists on signed loop iteration var :( */

    mpmod_init_set (modulus_local, modulus);

#pragma omp for
    for (i = 0; (unsigned long) i <= 2 * deg - 2; i++)
      mpres_mul_z_to_z (H[i], Vt, H[i], modulus_local);
    mpmod_clear (modulus_local);
  }
#else
  for (i = 0; (unsigned long) i <= 2 * deg - 2; i++)
    mpres_mul_z_to_z (H[i], Vt, H[i], modulus);
#endif

  list_output_poly (H, 2 * deg - 1, 0, 1, "/* list_scale_V */ "
		    "H(x)^2*(Q^2-4) == ", "\n", OUTPUT_TRACE);


  /* Multiply by (X - 1/X)^2 = X^2 - 2 + 1/X^2 and subtract from G */
  ASSERT (newtmplen > 0UL);
  if (deg == 1)
    {
      /* H(X) has degree 2*deg-2 = 0, so H(X) = h_0
	 H(X) * (X - 1/X)^2 = -2 h_0 + h_0 V_2(Y)  */
      mpz_mul_2exp (newtmp[0], H[0], 1UL);
      mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*H[0] */
      mpz_sub (G[2], G[2], H[0]);
    }
  else if (deg == 2)
    {
      /* H(X) has degree 2*deg-2 = 2, , so
	 H(X) = h_0 + h_1 (X+1/X) + h_2 (X^2+1/X^2)

	 H(X) * (X - 1/X)^2 =
	 -2*(h_0 - h_2) - h_1 * V_1(Y) + (h_0 - 2*h_2) * V_2(Y) +
	 h_1 * V_3(Y) + h_2 * V_4(Y)
      */
      mpz_sub (newtmp[0], H[0], H[2]);          /* h_0 - h_2 */
      mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* 2*(h_0 - h_2) */
      mpz_add (G[0], G[0], newtmp[0]);          /* G[0] -= -2*(h_0 - h_2) */

      mpz_add (G[1], G[1], H[1]);               /* G[1] -= -h_1 */
      mpz_sub (newtmp[0], newtmp[0], H[0]);     /* h_0 - 2*h_2 */
      mpz_sub (G[2], G[2], newtmp[0]);          /* G[2] -= h_0 - 2*h_2 */
      mpz_sub (G[3], G[3], H[1]);               /* G[3] -= h_1 */
      mpz_sub (G[4], G[4], H[2]);               /* G[3] -= h_2 */
    }
  else
    {
      /* Let H(X) = h_0 + \sum_{i=1}^{n} h_i V_i(Y), Y = X+1/X. Then
	 (x - 1/x)^2 H(X) =
	 -2(h_0 - h_2) +
	 (- h_1 + h_3) V_1(Y) +
	 \sum_{i=2}^{n-2} (h_{i-2} - 2h_i + h_{i+2}) V_i(Y) +
	 (h_{n-3} - 2h_{n-1}) V_{n-1}(Y) +
	 (h_{n-2} - 2h_n) V_n(Y) +
	 h_{n-1} V_{n+1}(Y) +
	 h_n V_{n+2}(Y)

	 In our case, n = 2 * deg - 2
      */
      mpz_sub (newtmp[0], H[0], H[2]);
      mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* t[0] = 2*(h_0 - h_2) */
      mpz_add (G[0], G[0], newtmp[0]);          /* G[0] -= -2*(h_0 - h_2) */

      mpz_add (G[1], G[1], H[1]);
      mpz_sub (G[1], G[1], H[3]); /* G[1] -= -h_1 + h_3 */

      for (i = 2; i <= 2 * deg - 4; i++)
	{
	  mpz_mul_2exp (newtmp[0], H[i], 1);
	  mpz_sub (newtmp[0], newtmp[0], H[i - 2]);
	  mpz_sub (newtmp[0], newtmp[0], H[i + 2]); /* 2h_i-h_{i-2}-h_{i+2} */
	  mpz_add (G[i], G[i], newtmp[0]); /* G[i] -= -2h_i+h_{i-2}+h_{i+2} */
	}

      for ( ; i <= 2 * deg - 2; i++)
	{
	  mpz_mul_2exp (newtmp[0], H[i], 1UL);
	  mpz_sub (newtmp[0], H[i - 2], newtmp[0]); /* h_{n-3} - 2h_{n-1} */
	  mpz_sub (G[i], G[i], newtmp[0]);
	}

      mpz_sub (G[i], G[i], H[i - 2]);
      mpz_sub (G[i + 1], G[i + 1], H[i - 1]);
    }

  for (i = 0; i <= 2 * deg; i++)
    mpz_mod (R[i], G[i], modulus->orig_modulus);

  if (test_verbose (OUTPUT_TRACE))
    for (i = 0; i <= 2 * deg; i++)
      outputf (OUTPUT_TRACE, "list_scale_V: R[%lu] = %Zd\n", i, R[i]);

#ifdef WANT_ASSERT
  mpz_mod (R[2 * deg], R[2 * deg], modulus->orig_modulus);
  ASSERT (mpz_cmp (leading, R[2 * deg]) == 0);
  mpz_clear (leading);
#endif

  mpres_clear (Vt, modulus);
}


#ifdef WANT_ASSERT
/* Check if l is an (anti-)symmetric, possibly monic, polynomial.
   Returns -1 if it is (anti-)symmetric, or the smallest index i where
   l[i] != l[len - 1 + monic - i])
   If anti == 1, the list is checked for symmetry, if it is -1, for
   antisymmetry.
   This function is used only if assertions are enabled.
*/

static long int ATTRIBUTE_UNUSED
list_is_symmetric (listz_t l, unsigned long len, int monic, int anti,
		   mpz_t modulus, mpz_t tmp)
{
    unsigned long i;

    ASSERT (monic == 0 || monic == 1);
    ASSERT (anti == 1 || anti == -1);

    if (monic && anti == 1 && mpz_cmp_ui (l[0], 1) != 0)
	return 0L;

    if (monic && anti == -1)
      {
	mpz_sub_ui (tmp, modulus, 1);
	if (mpz_cmp (tmp, l[0]) != 0)
	  return 0L;
      }

    for (i = monic; i < len / 2; i++)
      {
	if (anti == -1)
	  {
	    /* Negate (mod modulus) */
	    if (mpz_sgn (l[i]) == 0)
	      {
		if (mpz_sgn (l[len - 1 + monic - i]) != 0)
		  return (long) i;
	      }
	    else
	      {
		mpz_sub (tmp, modulus, l[i]);
		if (mpz_cmp (tmp, l[len - 1 + monic - i]) != 0)
		  return (long) i;
	      }
	  }
	else if (mpz_cmp (l[i], l[len - 1 + monic - i]) != 0)
	    return (long) i;
      }

    return -1L;
}
#endif

#if 0 && defined(WANT_ASSERT)
/* Evaluate a polynomial of degree n-1 with all coefficients given in F[],
   or of degree n with an implicit leading 1 monomial not stored in F[],
   at x modulo modulus. Result goes in r. tmp needs 2 entries. */

static void
list_eval_poly (mpz_t r, const listz_t F, const mpz_t x,
		const unsigned long n, const int monic, const mpz_t modulus,
		listz_t tmp)
{
  unsigned long i;

  mpz_set_ui (tmp[0], 1UL);
  mpz_set_ui (r, 0UL);

  for (i = 0UL; i < n; i++)
    {
      /* tmp[0] = x^i */
      mpz_mul (tmp[1], F[i], tmp[0]);
      mpz_mod (tmp[1], tmp[1], modulus);
      mpz_add (r, r, tmp[1]);

      mpz_mul (tmp[1], tmp[0], x);
      mpz_mod (tmp[0], tmp[1], modulus);
    }

  if (monic)
    mpz_add (r, r, tmp[0]);

  mpz_mod (r, r, modulus);
}
#endif

/* return the memory needed in the F[] array for poly_from_sets_V */
static unsigned long
mem_poly_from_sets_V (sets_long_t *sets)
{
  unsigned long c, deg, i, nr;
  set_long_t *set = sets->sets;
  unsigned long mem, maxmem = 0;

  deg = 1UL;
  for (nr = sets->nr - 1UL; nr > 0UL; nr--)
    {
      set = sets_nextset (sets->sets); /* Skip first set */
      for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
        set = sets_nextset (set);
      c = set->card;
      if (c == 2UL)
        deg *= 2UL;
      else
        {
          i = (c - 1UL) / 2UL - 1; /* maximal value of i */
          /* list_scale_V needs 2*deg+1 entries starting at
             F + (2UL * i + 1UL) * (deg + 1UL) */
          mem = (2UL * i + 1UL) * (deg + 1UL) + (2 * deg + 1UL);
          if (mem > maxmem)
            maxmem = mem;
	  deg *= c;
	}
    }

  return maxmem;
}

/* Build a polynomial with roots r^2i, i in the sumset of the sets in "sets".
   The parameter Q = r + 1/r. This code uses the fact that the polynomials
   are symmetric. Requires that the first set in "sets" has cardinality 2,
   all sets must be symmetric around 0. The resulting polynomial of degree
   2*d is F(x) = f_0 + \sum_{1 <= i <= d} f_i (x^i + 1/x^i). The coefficient
   f_i is stored in F[i], which therefore needs d+1 elements. */

static unsigned long
poly_from_sets_V (listz_t F, const mpres_t Q, sets_long_t *sets,
		  listz_t tmp, const unsigned long tmplen, mpmod_t modulus,
		  mpzspv_t dct, const mpzspm_t ntt_context)
{
  unsigned long c, deg, i, nr;
  set_long_t *set = sets->sets;
  mpres_t Qt;

  ASSERT_ALWAYS (sets->nr > 0UL);
  ASSERT_ALWAYS (set->card == 2UL); /* Check that the cardinality of
                                       first set is 2 */
  /* Check that first set is symmetric around 0 (we write card-1
     instead of 1 to avoid a compiler warning with clang 2.9) */
  ASSERT_ALWAYS (set->elem[0] == -set->elem[set->card - 1]);

  if (test_verbose (OUTPUT_TRACE))
    {
      mpz_t t;
      mpz_init (t);
      mpres_get_z (t, Q, modulus);
      outputf (OUTPUT_TRACE, "poly_from_sets_V (F, Q = %Zd, sets)\n", t);
      mpz_clear (t);
    }

  mpres_init (Qt, modulus);

  outputf (OUTPUT_DEVVERBOSE, " (processing set of size 2");

  V (Qt, Q, set->elem[0], modulus); /* First set in sets is {-k, k} */
  V (Qt, Qt, 2UL, modulus);         /* Qt = V_2k(Q) */

  mpres_neg (Qt, Qt, modulus);
  mpres_get_z (F[0], Qt, modulus);
  mpz_set_ui (F[1], 1UL);
  deg = 1UL;
  /* Here, F(x) = (x - r^{2k_1})(x - r^{-2k_1}) / x =
                  (x^2 - x (r^{2k_1} + r^{-2k_1}) + 1) / x =
		  (x + 1/x) - V_{2k_1}(r + 1/r) */

  for (nr = sets->nr - 1UL; nr > 0UL; nr--)
    {
      /* Assuming the sets are sorted in order of ascending cardinality,
         we process them back-to-front so the sets of cardinality 2 are
         processed last, but skipping the first set which we processed
         already. */

      set = sets_nextset (sets->sets); /* Skip first set */
      for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
        set = sets_nextset (set);

      /* Process this set. We assume it is either of cardinality 2, or of
	 odd cardinality */
      c = set->card;
      outputf (OUTPUT_DEVVERBOSE, " %lu", c);

      if (c == 2UL)
	{
	  /* Check it's symmetric (we write c-1 instead of 2 to avoid a
           compiler warning with clang 2.9) */
	  ASSERT_ALWAYS (set->elem[0] == -set->elem[c - 1]);
	  V (Qt, Q, set->elem[0], modulus);
	  V (Qt, Qt, 2UL, modulus);
	  list_scale_V (F, F, Qt, deg, modulus, tmp, tmplen, dct,
	                ntt_context);
	  deg *= 2UL;
	  ASSERT_ALWAYS (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
	}
      else
	{
	  ASSERT_ALWAYS (c % 2UL == 1UL);
	  ASSERT_ALWAYS (set->elem[(c - 1UL) / 2UL] == 0UL);
	  /* Generate the F(Q^{2k_i} * X)*F(Q^{-2k_i} * X) polynomials.
	     Each is symmetric of degree 2*deg, so each has deg+1 coeffients
	     in standard basis. */
	  for (i = 0UL; i < (c - 1UL) / 2UL; i++)
	    {
              /* Check it's symmetric */
	      ASSERT_ALWAYS (set->elem[i] == -set->elem[c - 1L - i]);
	      V (Qt, Q, set->elem[i], modulus);
	      V (Qt, Qt, 2UL, modulus);
	      ASSERT (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
	      list_scale_V (F + (2UL * i + 1UL) * (deg + 1UL), F, Qt, deg,
	                    modulus, tmp, tmplen, dct, ntt_context);
	      ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL * deg],
	              1UL) == 0); /* Check it's monic */
	    }
	  /* Multiply the polynomials */
	  for (i = 0UL; i < (c - 1UL) / 2UL; i++)
	    {
	      /* So far, we have the product
		 F(X) * F(Q^{2k_j} * X) * F(Q^{-2k_j} * X), 1 <= j <= i,
		 at F. This product has degree 2 * deg + i * 4 * deg, that is
		 (2 * i + 1) * 2 * deg, which means (2 * i + 1) * deg + 1
		 coefficients in F[0 ... (i * 2 + 1) * deg]. */
	      ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * deg], 1UL) == 0);
	      ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL*deg],
	                          1UL) == 0);
	      list_output_poly (F, (2UL * i + 1UL) * deg + 1, 0, 1,
				"poly_from_sets_V: Multiplying ", "\n",
				OUTPUT_TRACE);
	      list_output_poly (F + (2UL * i + 1UL) * (deg + 1UL),
	                        2UL * deg + 1UL, 0, 1, " and ", "\n",
	                        OUTPUT_TRACE);
	      list_mul_reciprocal (F,
		                   F, (2UL * i + 1UL) * deg + 1UL,
			 	   F + (2UL * i + 1UL) * (deg + 1UL),
				   2UL * deg + 1UL, modulus->orig_modulus,
				   tmp, tmplen);
	      list_mod (F, F, (2UL * i + 3UL) * deg + 1UL,
	                modulus->orig_modulus);
	      list_output_poly (F, (2UL * i + 3UL) * deg + 1UL, 0, 1,
                                " = ", "\n", OUTPUT_TRACE);
	      ASSERT (mpz_cmp_ui (F[(2UL * i + 3UL) * deg], 1UL) == 0);
	    }
	  deg *= c;
	}
    }

  mpres_clear (Qt, modulus);
  outputf (OUTPUT_DEVVERBOSE, ")");

  return deg;
}

static int
build_F_ntt (listz_t F, const mpres_t P_1, sets_long_t *S_1,
	     const faststage2_param_t *params, mpmod_t modulus)
{
  mpzspm_t F_ntt_context;
  mpzspv_t F_ntt;
  unsigned long tmplen;
  listz_t tmp;
  long timestart, realstart;
  unsigned long i;

  timestart = cputime ();
  realstart = realtime ();

  /* Precompute the small primes, primitive roots and inverses etc. for
     the NTT. The code to multiply wants a 3*k-th root of unity, where
     k is the smallest power of 2 with k > s_1/2 */

  F_ntt_context = mpzspm_init (3UL << ceil_log2 (params->s_1 / 2 + 1),
			       modulus->orig_modulus);
  if (F_ntt_context == NULL)
    {
      outputf (OUTPUT_ERROR, "Could not initialise F_ntt_context, "
               "presumably out of memory\n");
      return ECM_ERROR;
    }

  print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for building F = ",
		    F_ntt_context);

  outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");

  tmplen = params->s_1 + 100;
  tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
  F_ntt = mpzspv_init (1UL << ceil_log2 (params->s_1 / 2 + 1), F_ntt_context);

  i = poly_from_sets_V (F, P_1, S_1, tmp, tmplen, modulus, F_ntt,
                        F_ntt_context);
  ASSERT_ALWAYS(2 * i == params->s_1);
  ASSERT_ALWAYS(mpz_cmp_ui (F[i], 1UL) == 0);

  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
  if (test_verbose (OUTPUT_TRACE))
    {
      for (i = 0; i < params->s_1 / 2 + 1; i++)
	outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
      outputf (OUTPUT_TRACE, "f(x) = f_0");
      for (i = 1; i < params->s_1 / 2 + 1; i++)
	outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
      outputf (OUTPUT_TRACE, "/* PARI */ \n");
    }

  clear_list (tmp, tmplen);
  tmp = NULL;
  mpzspv_clear (F_ntt, F_ntt_context);
  F_ntt = NULL;
  mpzspm_clear (F_ntt_context);
  F_ntt_context = NULL;

  return 0;
}

/* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l.
   x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P.
   Stores the result in g[0 ... l] and/or in g_ntt[offset ... offset + l] */

static void
pm1_sequence_g (listz_t g_mpz, mpzspv_t g_ntt, const mpres_t b_1,
                const unsigned long P, const long M_param,
		const unsigned long l_param, const mpz_t m_1, const long k_2,
		mpmod_t modulus_param, const mpzspm_t ntt_context)
{
  mpres_t r[3], x_0, x_Mi;
  mpz_t t;
  unsigned long i;
  long timestart, realstart;
  long M = M_param;
  unsigned long l = l_param, offset = 0UL;
  mpmod_t modulus;
  int want_output = 1;

  outputf (OUTPUT_VERBOSE, "Computing g_i");
  outputf (OUTPUT_DEVVERBOSE, "\npm1_sequence_g: P = %lu, M_param = %lu, "
           "l_param = %lu, m_1 = %Zd, k_2 = %lu\n",
	   P, M_param, l_param, m_1, k_2);
  timestart = cputime ();
  realstart = realtime ();

#ifdef _OPENMP
#pragma omp parallel if (l > 100) private(r, x_0, x_Mi, t, i, M, l, offset, modulus, want_output)
  {
    /* When multi-threading, we adjust the parameters for each thread */

    const int nr_chunks = omp_get_num_threads();
    const int thread_nr = omp_get_thread_num();

    l = (l_param - 1) / nr_chunks + 1; /* = ceil(l_param / nr_chunks) */
    offset = thread_nr * l;
    if (offset <= l_param)
      l = MIN(l, l_param - offset);
    else
      l = 0;
    outputf (OUTPUT_DEVVERBOSE,
             "pm1_sequence_g: thread %d has l = %lu, offset = %lu.\n",
             thread_nr, l, offset);
    M = M_param - (long) offset;

    /* Let only the master thread print stuff */
    want_output = (thread_nr == 0);

    if (want_output)
      outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
#endif

  /* Make a private copy of the mpmod_t struct */
  mpmod_init_set (modulus, modulus_param);

  mpz_init (t);
  mpres_init (r[0], modulus);
  mpres_init (r[1], modulus);
  mpres_init (r[2], modulus);
  mpres_init (x_0, modulus);
  mpres_init (x_Mi, modulus);

  if (want_output)
    {
      if (test_verbose (OUTPUT_TRACE))
	{
	  mpres_get_z (t, b_1, modulus);
	  outputf (OUTPUT_TRACE, "\n/* pm1_sequence_g */ N = %Zd; "
		   "b_1 = Mod(%Zd, N); /* PARI */\n", modulus->orig_modulus, t);
	  outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ P = %lu; M = %ld; "
		   "m_1 = %Zd; /* PARI */\n", P, M, m_1);
	  outputf (OUTPUT_TRACE,
		   "/* pm1_sequence_g */ r = b_1^P; /* PARI */\n");
	  outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 = "
		   "b_1^(2*%ld + (2*m_1 + 1)*P); /* PARI */\n", k_2);
	}
    }

  /* We use (M-(i+1))^2 = (M-i)^2 + 2*(-M+i) + 1 */
  mpz_set_ui (t, P);
  mpres_pow (r[0], b_1, t, modulus);     /* r[0] = b_1^P = r */
  if (test_verbose (OUTPUT_TRACE))
    {
      mpres_get_z (t, r[0], modulus);
      outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ r == %Zd /* PARI C */\n", t);
    }

  /* FIXME: This is a huge mess, clean up some time */

  mpz_set_si (t, M);
  mpz_neg (t, t);
  mpz_mul_2exp (t, t, 1UL);
  mpz_add_ui (t, t, 1UL);
  /* Warning: t might be negative here. */
  mpres_pow (r[1], r[0], t, modulus);    /* r[1] = r^{2(-M+i)+1}, i = 0 */
  mpz_set_si (t, M);
  mpz_mul (t, t, t);                     /* t = M^2 */
  mpres_pow (r[2], r[0], t, modulus);    /* r[2] = r^{(M-i)^2}, i = 0 */
  mpres_sqr (r[0], r[0], modulus); /* r[0] = r^2 */

  mpz_mul_2exp (t, m_1, 1UL);
  mpz_add_ui (t, t, 1UL);
  mpz_mul_ui (t, t, P);
  mpz_add_si (t, t, k_2);
  mpz_add_si (t, t, k_2);
  if (want_output)
    outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ 2*%ld + (2*%Zd + 1)*P == "
	     "%Zd /* PARI C */\n", k_2, m_1, t);

  /* Warning: t might be negative here. */
  mpres_pow (x_0, b_1, t, modulus);  /* x_0 = b_1^{2*k_2 + (2*m_1 + 1)*P} */
  if (want_output && test_verbose (OUTPUT_TRACE))
    {
      mpres_get_z (t, x_0, modulus);
      outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 == %Zd /* PARI C */\n",
	       t);
    }

  mpz_set_si (t, M);
  mpres_pow (x_Mi, x_0, t, modulus); /* x_Mi = x_0^{M-i}, i = 0 */

  mpres_invert (x_0, x_0, modulus);  /* x_0 := x_0^{-1} now */
  mpres_mul (r[1], r[1], x_0, modulus); /* r[1] = x_0^{-1} * r^{-2M+1} */

  mpres_mul (r[2], r[2], x_Mi, modulus); /* r[2] = x_0^M * r^{M^2} */
  mpres_get_z (t, r[2], modulus);
  outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd; /* PARI */\n",
	   offset, t);
  if (l > 0)
    {
      if (g_mpz != NULL)
        mpz_set (g_mpz[offset], t);
      if (g_ntt != NULL)
        mpzspv_from_mpzv (g_ntt, offset, &t, 1UL, ntt_context);
    }

  /* So here we have for i = 0
     r[2] = x_0^(M-i) * r^{(M-i)^2}
     r[1] = x_0^{-1} * r^{2(-M+i)+1}
     r[0] = r^2
     t = r[2]
  */

  for (i = 1; i < l; i++)
    {
      if (g_mpz != NULL)
        {
	  mpres_mul_z_to_z (g_mpz[offset + i], r[1], g_mpz[offset + i - 1],
			    modulus);
	  outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
		   " /* PARI */\n", offset + i, g_mpz[offset + i]);
        }
      if (g_ntt != NULL)
      {
	  mpres_mul_z_to_z (t, r[1], t, modulus);
	  if (g_mpz == NULL) /* Only one should be non-NULL... */
	      outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
		       " /* PARI */\n", offset + i, t);
	  mpzspv_from_mpzv (g_ntt, offset + i, &t, 1UL, ntt_context);
      }
      mpres_mul (r[1], r[1], r[0], modulus);
    }

  mpres_clear (r[0], modulus);
  mpres_clear (r[1], modulus);
  mpres_clear (r[2], modulus);
  mpres_clear (x_0, modulus);
  mpres_clear (x_Mi, modulus);
  mpz_clear (t);
  mpmod_clear (modulus); /* Clear our private copy of modulus */

#ifdef _OPENMP
  }
#endif

  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

  if (test_verbose (OUTPUT_TRACE))
    {
      for (i = 0; i < l_param; i++)
	{
	  outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu == x_0^"
		   "(M - %lu) * r^((M - %lu)^2) /* PARI C */\n", i, i, i);
	}
      outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g(x) = g_0");
      for (i = 1; i < l; i++)
	outputf (OUTPUT_TRACE, " + g_%lu * x^%lu", i, i);
      outputf (OUTPUT_TRACE, " /* PARI */\n");
    }
}


/* Compute h_j = r^(-j^2) * f_j for 0 <= j < d as described in section 9
   of the paper. h == f is ok. */

static void
pm1_sequence_h (listz_t h, mpzspv_t h_ntt, mpz_t *f, const mpres_t r,
		const unsigned long d, mpmod_t modulus_parm,
		const mpzspm_t ntt_context)
{
  mpres_t invr;  /* r^{-1}. Can be shared between threads */
  long timestart, realstart;

  mpres_init (invr, modulus_parm);
  mpres_invert (invr, r, modulus_parm); /* invr = r^{-1}. FIXME: test for
					   failure, even if theoretically
					   impossible */

  if (test_verbose (OUTPUT_TRACE))
    {
      mpz_t t;
      mpz_init (t);
      mpres_get_z (t, r, modulus_parm);
      outputf (OUTPUT_TRACE, "\n/* pm1_sequence_h */ N = %Zd; "
	       "r = Mod(%Zd, N); /* PARI */\n",
	       modulus_parm->orig_modulus, t);
      mpz_clear (t);
    }

  outputf (OUTPUT_VERBOSE, "Computing h");
  timestart = cputime ();
  realstart = realtime ();

#ifdef _OPENMP
#pragma omp parallel if (d > 100)
#endif
  {
    mpres_t fd[3]; /* finite differences table for r^{-i^2}*/
    mpz_t t;       /* the h_j value as an mpz_t */
    unsigned long j;
    unsigned long offset = 0UL, len = d;
    mpmod_t modulus;

    /* Adjust offset and length for this thread */
#ifdef _OPENMP
    {
      const int nr_chunks = omp_get_num_threads();
      const int thread_nr = omp_get_thread_num();
      unsigned long chunklen;

      if (thread_nr == 0)
	outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);

      chunklen = (len - 1UL) / (unsigned long) nr_chunks + 1UL;
      offset = chunklen * (unsigned long) thread_nr;
      if (offset <= len)
        len = MIN(chunklen, len - offset);
      else
        len = 0;
    }
#endif

    mpmod_init_set (modulus, modulus_parm);
    mpres_init (fd[0], modulus);
    mpres_init (fd[1], modulus);
    mpres_init (fd[2], modulus);
    mpz_init (t);

    /* We have (n + 1)^2 = n^2 + 2n + 1. For the finite differences we'll
       need r^{-2}, r^{-(2n+1)}, r^{-n^2}. Init for n = 0. */

    /* r^{-2} in fd[0] is constant and could be shared. Computing it
       separately in each thread has the advantage of putting it in
       local memory. May not make much difference overall */

    mpres_sqr (fd[0], invr, modulus);       /* fd[0] = r^{-2} */
    mpz_set_ui (t, offset);
    mpz_mul_2exp (t, t, 1UL);
    mpz_add_ui (t, t, 1UL);                 /* t = 2 * offset + 1 */
    mpres_pow (fd[1], invr, t, modulus);    /* fd[1] = r^{-(2*offset+1)} */
    mpz_set_ui (t, offset);
    mpz_mul (t, t, t);                      /* t = offset^2 */
    mpres_pow (fd[2], invr, t, modulus);    /* fd[2] = r^{-offset^2} */

    /* Generate the sequence */
    for (j = offset; j < offset + len; j++)
      {
	mpres_mul_z_to_z (t, fd[2], f[j], modulus);
	outputf (OUTPUT_TRACE,
		 "/* pm1_sequence_h */ h_%lu = %Zd; /* PARI */\n", j, t);

	if (h != NULL)
	  mpz_set (h[j], t);
	if (h_ntt != NULL)
	  mpzspv_from_mpzv (h_ntt, j, &t, 1UL, ntt_context);

	mpres_mul (fd[2], fd[2], fd[1], modulus); /* fd[2] = r^{-j^2} */
	mpres_mul (fd[1], fd[1], fd[0], modulus); /* fd[1] = r^{-2*j-1} */
      }

    mpres_clear (fd[2], modulus);
    mpres_clear (fd[1], modulus);
    mpres_clear (fd[0], modulus);
    mpz_clear (t);
    mpmod_clear (modulus);
  }

  mpres_clear (invr, modulus_parm);

  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

  if (test_verbose (OUTPUT_TRACE))
    {
      unsigned long j;
      for (j = 0; j < d; j++)
	outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h_%lu == "
		   "f_%lu * r^(-%lu^2) /* PARI C */\n", j, j, j);
      outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h(x) = h_0");
      for (j = 1; j < d; j++)
        outputf (OUTPUT_TRACE, " + h_%lu * (x^%lu + x^(-%lu))", j, j, j);
      outputf (OUTPUT_TRACE, " /* PARI */\n");
    }
}


static int
make_S_1_S_2 (sets_long_t **S_1, set_long_t **S_2,
              const faststage2_param_t *params)
{
  unsigned long i;
  sets_long_t *facS_2;
  size_t facS_2_size;

  *S_1 = sets_get_factored_sorted (params->P);
  if (*S_1 == NULL)
    return ECM_ERROR;

  {
    mpz_t t1, t2;

    mpz_init (t1);
    mpz_init (t2);
    sets_sumset_minmax (t1, *S_1, 1);
    sets_max (t2, params->P);
    ASSERT_ALWAYS (mpz_cmp (t1, t2) == 0);
    mpz_clear (t1);
    mpz_clear (t2);
  }

  *S_2 = malloc (set_sizeof(params->s_2));
  if (*S_2 == NULL)
    {
      free (*S_1);
      return ECM_ERROR;
    }

  /* Extract sets for S_2 and compute the set of sums */

  sets_extract (NULL, &facS_2_size, *S_1, params->s_2);
  facS_2 = malloc (facS_2_size);
  if (facS_2 == NULL)
    {
      free (*S_1);
      free (*S_2);
      return ECM_ERROR;
    }
  sets_extract (facS_2, NULL, *S_1, params->s_2);
  sets_sumset (*S_2, facS_2);
  ASSERT_ALWAYS ((*S_2)->card == params->s_2);
  free (facS_2);
  quicksort_long ((*S_2)->elem, (*S_2)->card);

  /* Print the sets in devverbose mode */
  if (test_verbose (OUTPUT_DEVVERBOSE))
    {
      outputf (OUTPUT_DEVVERBOSE, "S_1 = ");
      sets_print (OUTPUT_DEVVERBOSE, *S_1);

      outputf (OUTPUT_DEVVERBOSE, "S_2 = {");
      for (i = 0UL; i + 1UL < params->s_2; i++)
	outputf (OUTPUT_DEVVERBOSE, "%ld, ", (*S_2)->elem[i]);
      if (i < params->s_2)
	outputf (OUTPUT_DEVVERBOSE, "%ld", (*S_2)->elem[i]);
      outputf (OUTPUT_DEVVERBOSE, "}\n");
    }

  return 0;
}

#if 0
static mpzspv_t *
mpzspv_init_mt (spv_size_t len, mpzspm_t mpzspm)
{
  int i; /* OpenMP wants the iteration variable a signed type */
  mpzspv_t *x = (mpzspv_t *) malloc (mpzspm->sp_num * sizeof (spv_t *));

  if (x == NULL)
    return NULL;

  for (i = 0; i < (int) mpzspm->sp_num; i++)
    x[i] = NULL;

#ifdef _OPENMP
#pragma omp parallel private(i) shared(x)
  {
#pragma omp for
#endif
    for (i = 0; i < (int) mpzspm->sp_num; i++)
      x[i] = (spv_t *) sp_aligned_malloc (len * sizeof (sp_t));

#ifdef _OPENMP
  }
#endif

  for (i = 0; i < (int) mpzspm->sp_num; i++)
    if (x[i] == NULL)
      break;

  if (i != (int) mpzspm->sp_num) /* There is a NULL pointer */
    {
      for (i = 0; i < (int) mpzspm->sp_num; i++)
	if (x[i] != NULL)
	  sp_aligned_free(x[i]);
      return NULL;
    }

#if 0
  if (test_verbose (OUTPUT_DEVVERBOSE))
    {
      spv_t * last = x[0];
      printf ("mpzspv_init_mt: x[0] = %p\n", x[0]);
      for (i = 1; i < (int) mpzspm->sp_num; i++)
        printf ("mpzspv_init_mt: x[%d] = %p, distance = %ld\n",
                i, x[i], (long) (x[i] - x[i-1]));
    }
#endif

  return x;
}
#endif

/* Square the reciprocal Laurent polynomial S(x) of degree 2*n-2.
   S(x) = s_0 + \sum_{i=1}^{n-1} s_i (x^i + x^{-1}).
   S[i] contains the n coefficients s_i, 0 <= i <= n-1.
   R[i] will contain the 2n-1 coefficients r_i, 0 <= i <= 2*n-2, where
   R(x) = S(x)^2 = r_0 + \sum_{i=1}^{2n-2} r_i (x^i + x^{-1}).
   dft must have power of 2 length len >= 2n.
   The NTT primes must be == 1 (mod 3*len).
*/

#undef TRACE_ntt_sqr_reciprocal
static void
ntt_sqr_reciprocal (mpzv_t R, const mpzv_t S, mpzspv_t dft,
		    const spv_size_t n, const mpzspm_t ntt_context)
{
#ifdef WANT_ASSERT
  mpz_t S_eval_1, R_eval_1;
#endif

  if (n == 0)
    return;

  if (n == 1)
    {
      mpz_mul (R[0], S[0], S[0]);
      mpz_mod (R[0], R[0], ntt_context->modulus);
      return;
    }

#ifdef WANT_ASSERT
  mpz_init (S_eval_1);
  list_recip_eval1 (S_eval_1, S, n);
  /* Compute (S(1))^2 */
  mpz_mul (S_eval_1, S_eval_1, S_eval_1);
  mpz_mod (S_eval_1, S_eval_1, ntt_context->modulus);
#endif

#ifdef TRACE_ntt_sqr_reciprocal
  printf ("ntt_sqr_reciprocal: n %lu, length %lu\n", n, len);
  gmp_printf ("Input polynomial is %Zd", S[0]);
  {
    int j;
    for (j = 1; (spv_size_t) j < n; j++)
      gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", S[j], j, j);
  }
  printf ("\n");
#endif

  /* Fill NTT elements [0 .. n-1] with coefficients */
  mpzspv_from_mpzv (dft, (spv_size_t) 0, S, n, ntt_context);
  mpzspv_sqr_reciprocal (dft, n, ntt_context);

#if defined(_OPENMP)
#pragma omp parallel if (n > 50)
#endif
  {
    spv_size_t i, offset = 0, chunklen = 2*n - 1;

#if defined(_OPENMP)
    {
      const int nr_chunks = omp_get_num_threads();
      const int thread_nr = omp_get_thread_num();

      chunklen = (chunklen - 1) / (spv_size_t) nr_chunks + 1;
      offset = (spv_size_t) thread_nr * chunklen;
      if (offset <= 2*n - 1)
        chunklen = MIN(chunklen, (2*n - 1) - offset);
      else
        chunklen = 0UL;
    }
#endif

    mpzspv_to_mpzv (dft, offset, R + offset, chunklen, ntt_context);
    for (i = offset; i < offset + chunklen; i++)
      mpz_mod (R[i], R[i], ntt_context->modulus);
  }

#ifdef TRACE_ntt_sqr_reciprocal
  gmp_printf ("ntt_sqr_reciprocal: Output polynomial is %Zd", R[0]);
  for (j = 1; (spv_size_t) j < 2*n - 1; j++)
    gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
  printf ("\n");
#endif

#ifdef WANT_ASSERT
  mpz_init (R_eval_1);
  /* Compute (S^2)(1) and compare to (S(1))^2 */
  list_recip_eval1 (R_eval_1, R, 2 * n - 1);
  mpz_mod (R_eval_1, R_eval_1, ntt_context->modulus);
  if (mpz_cmp (R_eval_1, S_eval_1) != 0)
    {
      gmp_fprintf (stderr, "ntt_sqr_reciprocal: (S(1))^2 = %Zd but "
		   "(S^2)(1) = %Zd\n", S_eval_1, R_eval_1);
#if 0
      gmp_printf ("Output polynomial is %Zd", R[0]);
      for (j = 1; (spv_size_t) j < 2*n - 1; j++)
	gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
      printf ("\n");
#endif
      abort ();
    }
  mpz_clear (S_eval_1);
  mpz_clear (R_eval_1);
#endif
}


/* Computes gcd(\prod_{0 <= i < len} (ntt[i + offset] + add[i]), N),
   the NTT residues are converted to integer residues (mod N) first.
   If add == NULL, add[i] is assumed to be 0. */

static void
ntt_gcd (mpz_t f, mpz_t *product, mpzspv_t ntt, const unsigned long ntt_offset,
	 const listz_t add, const unsigned long len_param,
	 const mpzspm_t ntt_context, mpmod_t modulus_param)
{
  unsigned long i, j;
  const unsigned long Rlen = MPZSPV_NORMALISE_STRIDE;
  listz_t R;
  unsigned long len = len_param, thread_offset = 0;
  mpres_t tmpres, tmpprod, totalprod;
  mpmod_t modulus;
  long timestart, realstart;

  outputf (OUTPUT_VERBOSE, "Computing gcd of coefficients and N");
  timestart = cputime ();
  realstart = realtime ();

  /* All the threads will multiply their partial products to this one. */
  mpres_init (totalprod, modulus_param);
  mpres_set_ui (totalprod, 1UL, modulus_param);

#ifdef _OPENMP
#pragma omp parallel if (len > 100) private(i, j, R, len, thread_offset, tmpres, tmpprod, modulus) shared(totalprod)
  {
    const int nr_chunks = omp_get_num_threads();
    const int thread_nr = omp_get_thread_num();

    len = (len_param - 1) / nr_chunks + 1;
    thread_offset = thread_nr * len;
    if (thread_offset <= len_param)
      len = MIN(len, len_param - thread_offset);
    else
      len = 0;
#pragma omp master
    {
      outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
    }
#endif

    /* Make a private copy of the mpmod_t struct */
    mpmod_init_set (modulus, modulus_param);

    R = init_list2 (Rlen, (mpz_size (modulus->orig_modulus) + 2) *
                           GMP_NUMB_BITS);
    mpres_init (tmpres, modulus);
    mpres_init (tmpprod, modulus);
    mpres_set_ui (tmpprod, 1UL, modulus);

    for (i = 0; i < len; i += Rlen)
      {
	const unsigned long blocklen = MIN(len - i, Rlen);

	/* Convert blocklen residues from NTT to integer representatives
	   and store them in R */
	mpzspv_to_mpzv (ntt, ntt_offset + thread_offset + i, R, blocklen,
			ntt_context);

	/* Accumulate product in tmpprod */
	for (j = 0; j < blocklen; j++)
	  {
	    outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[j]);
	    if (add != NULL)
	      mpz_add (R[j], R[j], add[i + thread_offset + j]);
	    mpres_set_z_for_gcd (tmpres, R[j], modulus);
#define TEST_ZERO_RESULT
#ifdef TEST_ZERO_RESULT
	    if (mpres_is_zero (tmpres, modulus))
	      outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
#endif
	    mpres_mul (tmpprod, tmpprod, tmpres, modulus);
	  }
      }
#ifdef _OPENMP
#pragma omp critical
    {
      mpres_mul (totalprod, totalprod, tmpprod, modulus);
    }
#else
    mpres_set (totalprod, tmpprod, modulus);
#endif
    mpres_clear (tmpres, modulus);
    mpres_clear (tmpprod, modulus);
    mpmod_clear (modulus);
    clear_list (R, Rlen);
#ifdef _OPENMP
  }
#endif

  {
    mpz_t n;
    mpz_init (n);
    mpz_set_ui (n, len_param);
    mpres_set_z_for_gcd_fix (totalprod, totalprod, n, modulus_param);
    mpz_clear (n);
  }

  if (product != NULL)
    mpres_get_z (*product, totalprod, modulus_param);

  mpres_gcd (f, totalprod, modulus_param);
  mpres_clear (totalprod, modulus_param);

  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
}


int
pm1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus,
	const faststage2_param_t *params)
{
  unsigned long phiP, nr;
  unsigned long i, l, lenF, lenG, lenR, tmplen;
  sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
                       progressions of prime length */
  set_long_t *S_2; /* This is stored as a regular set */
  listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
		  degree s_1. It is symmetric, so has only s_1 / 2 + 1
		  distinct coefficients. The sequence h_j will be stored in
		  the same memory and won't be a monic polynomial, so the
		  leading 1 monomial of F will be stored explicitly. Hence we
		  need s_1 / 2 + 1 entries. */
  listz_t g, h, tmp, R;
  mpz_t mt;   /* All-purpose temp mpz_t */
  mpres_t mr; /* All-purpose temp mpres_t */
  int youpi = ECM_NO_FACTOR_FOUND;
  long timetotalstart, realtotalstart, timestart;

  timetotalstart = cputime ();
  realtotalstart = realtime ();

  phiP = eulerphi (params->P);
  ASSERT_ALWAYS (phiP == params->s_1 * params->s_2);
  ASSERT_ALWAYS (params->s_1 < params->l);
  nr = params->l - params->s_1; /* Number of points we evaluate */

  if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
      return ECM_ERROR;

  /* Allocate all the memory we'll need */
  /* Allocate the correct amount of space for each mpz_t or the
     reallocations will up to double the time for stage 2! */
  mpz_init (mt);
  mpres_init (mr, modulus);
  lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
				     the leading 1 monomial for each factor */
  F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
  h = malloc ((params->s_1 + 1) * sizeof (mpz_t));
  if (h == NULL)
    {
      fprintf (stderr, "Cannot allocate memory in pm1fs2\n");
      exit (1);
    }
  lenG = params->l;
  g = init_list2 (lenG, (unsigned int) abs (modulus->bits));
  lenR = nr;
  R = init_list2 (lenR, (unsigned int) abs (modulus->bits));
  tmplen = 3UL * params->l + list_mul_mem (params->l / 2);
  outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
  if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
    {
      tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
      /* FIXME: It appears TMulGen_space() returns a too small value! */
      outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n",
	       tmplen);
    }
#ifdef SHOW_TMP_USAGE
  tmp = init_list (tmplen);
#else
  tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
#endif

  mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
  outputf (OUTPUT_TRACE,
	   "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
	   modulus->orig_modulus, mt);


  /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2k_1}) */
  outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");

  timestart = cputime ();

  /* First compute X + 1/X */
  mpres_invert (mr, X, modulus);
  mpres_add (mr, mr, X, modulus);

  i = poly_from_sets_V (F, mr, S_1, tmp, tmplen, modulus, NULL, NULL);
  ASSERT_ALWAYS(2 * i == params->s_1);
  ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
  free (S_1);
  S_1 = NULL;

  outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
  if (test_verbose (OUTPUT_TRACE))
    {
      for (i = 0; i < params->s_1 / 2 + 1; i++)
	outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
      outputf (OUTPUT_TRACE, "f(x) = f_0");
      for (i = 1; i < params->s_1 / 2 + 1; i++)
	outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
      outputf (OUTPUT_TRACE, "/* PARI */ \n");
    }

  mpz_set_ui (mt, params->P);
  mpres_pow (mr, X, mt, modulus); /* mr = X^P */
  pm1_sequence_h (F, NULL, F, mr, params->s_1 / 2 + 1, modulus, NULL);

  /* Make a symmetric copy of F in h. It will have length
     s_1 + 1 = 2*lenF - 1 */
  /* I.e. with F = [3, 2, 1], s_1 = 4, we want h = [1, 2, 3, 2, 1] */
  for (i = 0; i < params->s_1 / 2 + 1; i++)
    *(h[i]) = *(F[params->s_1 / 2 - i]); /* Clone the mpz_t. */
  for (i = 0; i < params->s_1 / 2; i++)
    *(h[i + params->s_1 / 2 + 1]) = *(F[i + 1]);
  if (test_verbose (OUTPUT_TRACE))
    {
      for (i = 0; i < params->s_1 + 1; i++)
        outputf (OUTPUT_VERBOSE, "h_%lu = %Zd; /* PARI */\n", i, h[i]);
      outputf (OUTPUT_VERBOSE, "h(x) = h_0");
      for (i = 1; i < params->s_1 + 1; i++)
        outputf (OUTPUT_VERBOSE, " + h_%lu * x^%lu", i, i);
      outputf (OUTPUT_VERBOSE, " /* PARI */\n");
    }

  for (l = 0; l < params->s_2; l++)
    {
      const unsigned long M = params->l - 1L - params->s_1 / 2L;
      outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
               l + 1, params->s_2);
      pm1_sequence_g (g, NULL, X, params->P, M, params->l,
		      params->m_1, S_2->elem[l], modulus, NULL);

      /* Do the convolution */
      /* Use the transposed "Middle Product" algorithm */
      /* TMulGen reverses the first input sequence, but that doesn't matter
	 since h is symmetric. */

      outputf (OUTPUT_VERBOSE, "TMulGen of g and h");
      timestart = cputime ();
      ASSERT(tmplen >= TMulGen_space (nr - 1, params->l - 1, params->s_1));

      /* Computes rev(h)*g, stores coefficients of x^(s_1) to
	 x^(s_1+nr-1) = x^(len-1) */
      if (TMulGen (R, nr - 1, h, params->s_1, g, params->l - 1, tmp,
		   modulus->orig_modulus) < 0)
	{
	  outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
		   "of memory)\n");
	  youpi = ECM_ERROR;
	  break;
	}
      list_mod (R, R, nr, modulus->orig_modulus);

      outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);

#if 0 && defined(WANT_ASSERT)

      /* See if R[i] is correct, with a test that works even if i0 != 0 */
      /* More expensive self-test */
      /* alpha = beta*(i0 + l*nr) */
      /* This code is old and probably does not work. */

      outputf (OUTPUT_VERBOSE, "Verifying all results (slow)");
      for (i = 0; i < nr; i++)
	{
	  mpz_set_ui (mt, nr * l);
	  mpz_add (mt, mt, root_params->i0);
	  mpz_add_ui (mt, mt, i);
	  mpz_mul_ui (mt, mt, beta);
	  mpres_get_z (tmp[0], X, modulus);
	  mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus);
	  /* Hence, tmp[0] = X^(alpha + i * beta) */
	  list_eval_poly (tmp[1], F, tmp[0], dF, 1, modulus->orig_modulus,
			  tmp + 2);

	  mpz_set_ui (mt, i);
	  mpz_mul_ui (mt, mt, i);
	  mpz_mul_ui (mt, mt, beta / 2); /* h(i) = beta*i^2/2 */
	  mpres_get_z (tmp[0], X, modulus);
	  mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus); /* X^h(1) */
	  mpz_mul (tmp[0], tmp[0], R[i]);
	  mpz_mod (tmp[0], tmp[0], modulus->orig_modulus);
	  if (mpz_cmp (tmp[0], tmp[1]) != 0)
	    {
	      outputf (OUTPUT_ERROR, "Result in R[%ld] incorrect.\n", i);
	      outputf (OUTPUT_ERROR, "R[%ld] = %Zd\n", i, R[i]);
	      abort ();
	    }
	}
      outputf (OUTPUT_VERBOSE, " - everything's correct! :-D\n");
#endif

      if (test_verbose (OUTPUT_TRACE))
	{
	  for (i = 0; i < nr; i++)
	    outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[i]);
	}

      outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)");
      timestart = cputime ();

      {
	mpres_t tmpres, tmpprod;
	mpres_init (tmpres, modulus);
	mpres_init (tmpprod, modulus);
	mpres_set_z_for_gcd (tmpprod, R[0], modulus);
	for (i = 1; i < nr; i++)
	  {
	    mpres_set_z_for_gcd (tmpres, R[i], modulus);
	    mpres_mul (tmpprod, tmpprod, tmpres, modulus);
	  }
        mpres_get_z (tmp[1], tmpprod, modulus); /* For printing */
	mpres_gcd (tmp[0], tmpprod, modulus);
	mpres_clear (tmpprod, modulus);
	mpres_clear (tmpres, modulus);
      }

      outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
      outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", tmp[1]);

      if (mpz_cmp_ui (tmp[0], 1UL) > 0)
	{
	  mpz_set (f, tmp[0]);
	  youpi = ECM_FACTOR_FOUND_STEP2;
	  break;
	}
    }

#ifdef SHOW_TMP_USAGE
  for (i = tmplen - 1; i > 0; i--)
    if (tmp[i]->_mp_alloc > 1)
      break;
  outputf (OUTPUT_DEVVERBOSE, "Highest used temp element is tmp[%lu]\n", i);
#endif

  free (S_2);
  free (h);
  clear_list (F, lenF);
  clear_list (g, lenG);
  clear_list (R, lenR);
  clear_list (tmp, tmplen);

  mpz_clear (mt);
  mpres_clear (mr, modulus);

  outputf (OUTPUT_NORMAL, "Step 2");
  /* In normal output mode, print only cpu time as we always have.
     In verbose mode, print real time as well if we used multi-threading */
  if (test_verbose (OUTPUT_VERBOSE))
    print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
  else
    print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);

  return youpi;
}


int
pm1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus,
	const faststage2_param_t *params)
{
  unsigned long nr;
  unsigned long l, lenF;
  sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
                       progressions of prime length) */
  set_long_t *S_2; /* This is stored as a regular set */
  listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
		  degree s_1. It is symmetric, so has only s_1 / 2 + 1
		  distinct coefficients. The sequence h_j will be stored in
		  the same memory and won't be a monic polynomial, so the
		  leading 1 monomial of F will be stored explicitly. Hence we
		  need s_1 / 2 + 1 entries. */
  mpzspm_t ntt_context;
  mpzspv_t g_ntt, h_ntt;
  mpz_t mt;   /* All-purpose temp mpz_t */
  mpz_t product; /* Product of each multi-point evaluation */
  mpz_t *product_ptr = NULL;
  mpres_t tmpres; /* All-purpose temp mpres_t */
  int youpi = ECM_NO_FACTOR_FOUND;
  long timetotalstart, realtotalstart, timestart, realstart;

  timetotalstart = cputime ();
  realtotalstart = realtime ();

  ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
  ASSERT_ALWAYS (params->s_1 < params->l);
  nr = params->l - params->s_1; /* Number of points we evaluate */

  /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution
     with g. We need NTT of transform length l. We do it here at the start
     of stage 2 so that in case of a "not enough primes" condition,
     we don't have to wait until after F is built to get the error. */

  ntt_context = mpzspm_init (params->l, modulus->orig_modulus);
  if (ntt_context == NULL)
    {
      outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
               "presumably out of memory\n");
      return ECM_ERROR;
    }

  print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ",
		    ntt_context);

  if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
      return ECM_ERROR;

  /* Allocate all the memory we'll need for building f */
  mpz_init (mt);
  mpres_init (tmpres, modulus);
  lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
				     the leading 1 monomial for each factor */
  {
    /* in some cases the above value of lenF is not enough, for example with
       s_1 = 10, which gives lenF = 7, but 9 entries are needed */
    unsigned long mem = mem_poly_from_sets_V (S_1);
    if (mem > lenF)
      lenF = mem;
  }

  F = init_list2 (lenF, (unsigned int) abs (modulus->bits));

  mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
  outputf (OUTPUT_TRACE,
	   "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
	   modulus->orig_modulus, mt);

#if 0 && defined (WANT_ASSERT)
  /* For this self test run with a large enough B2 so that enough memory
     is allocated for tmp and F_ntt, otherwise it segfaults. */
  {
    int testlen = 255;
    int i, j;
    /* A test of ntt_sqr_reciprocal() */
    for (j = 1; j <= testlen; j++)
      {
        outputf (OUTPUT_VERBOSE,
                 "Testing ntt_sqr_reciprocal() for input degree %d\n",
                 j - 1);
        for (i = 0; i < j; i++)
          mpz_set_ui (tmp[i], 1UL);
        ntt_sqr_reciprocal (tmp, tmp, F_ntt, (spv_size_t) j, ntt_context_F);
        for (i = 0; i < 2 * j - 1; i++)
          {
            ASSERT (mpz_cmp_ui (tmp[i], 2 * j - 1 - i) == 0);
          }
      }
    outputf (OUTPUT_VERBOSE,
             "Test of ntt_sqr_reciprocal() for input degree 2 ... %d passed\n",
             testlen - 1);
  }
#endif


  /* First compute X + 1/X */
  mpres_invert (tmpres, X, modulus);
  mpres_add (tmpres, tmpres, X, modulus);

  if (build_F_ntt (F, tmpres, S_1, params, modulus) == ECM_ERROR)
    {
      free (S_1);
      free (S_2);
      mpz_clear (mt);
      mpres_clear (tmpres, modulus);
      mpzspm_clear (ntt_context);
      clear_list (F, lenF);
      return ECM_ERROR;
    }

  free (S_1);
  S_1 = NULL;

  h_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);

  mpz_set_ui (mt, params->P);
  mpres_pow (tmpres, X, mt, modulus); /* tmpres = X^P */
  pm1_sequence_h (NULL, h_ntt, F, tmpres, params->s_1 / 2 + 1, modulus,
		  ntt_context);

  clear_list (F, lenF);
  g_ntt = mpzspv_init (params->l, ntt_context);

  /* Compute the DCT-I of h */
  outputf (OUTPUT_VERBOSE, "Computing DCT-I of h");
#ifdef _OPENMP
  outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
#endif
  timestart = cputime ();
  realstart = realtime ();

  mpzspv_to_dct1 (h_ntt, h_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
                  g_ntt, ntt_context);
  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

  if (test_verbose (OUTPUT_RESVERBOSE))
    {
      mpz_init (product);
      product_ptr = &product;
    }

  for (l = 0; l < params->s_2; l++)
    {
      const unsigned long M = params->l - 1L - params->s_1 / 2L;

      outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
               l + 1, params->s_2);
      /* Compute the coefficients of the polynomial g(x) */
      pm1_sequence_g (NULL, g_ntt, X, params->P, M, params->l,
		      params->m_1, S_2->elem[l], modulus, ntt_context);

      /* Do the convolution */
      outputf (OUTPUT_VERBOSE, "Computing g*h");
#ifdef _OPENMP
      outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
#endif
      timestart = cputime ();
      realstart = realtime ();
      mpzspv_mul_by_dct (g_ntt, h_ntt, params->l, ntt_context,
        NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
      print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

      /* Compute GCD of N and coefficients of product polynomial */
      ntt_gcd (mt, product_ptr, g_ntt, params->s_1 / 2, NULL, nr, ntt_context,
	       modulus);

      outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);

      /* If we found a factor, stop */
      if (mpz_cmp_ui (mt, 1UL) > 0)
	{
	  mpz_set (f, mt);
	  youpi = ECM_FACTOR_FOUND_STEP2;
	  break;
	}
    }

  if (test_verbose (OUTPUT_RESVERBOSE))
    {
      product_ptr = NULL;
      mpz_clear (product);
    }
  mpzspv_clear (g_ntt, ntt_context);
  mpzspv_clear (h_ntt, ntt_context);
  mpzspm_clear (ntt_context);
  mpres_clear (tmpres, modulus);
  mpz_clear (mt);
  free (S_2);

  outputf (OUTPUT_NORMAL, "Step 2");
  /* In normal output mode, print only cpu time as we always have.
     In verbose mode, print real time as well if we used multi-threading */
  if (test_verbose (OUTPUT_VERBOSE))
    print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
  else
    print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);

  return youpi;
}


static void
gfp_ext_print (const mpres_t r_x, const mpres_t r_y, mpmod_t modulus,
	       const int verbose)
{
  mpz_t t1, t2;

  if (!test_verbose (verbose))
    return;

  mpz_init (t1);
  mpz_init (t2);
  mpres_get_z (t1, r_x, modulus);
  mpres_get_z (t2, r_y, modulus);
  outputf (verbose, "Mod(%Zd, N) + Mod(%Zd, N) * w", t1, t2);

  mpz_clear (t1);
  mpz_clear (t2);
}


/* Multiplies (a_0 + a_1*sqrt(Delta)) * (b_0 + b_1*sqrt(Delta))
   using four multiplications. Result goes in (r_0 + r_1*sqrt(Delta)).
   a_0, b_0, r_0 as well as a_1, b_1, r_1 may overlap arbitrarily. t[0], t[1],
   t[2] and Delta must not overlap with anything. */
/* FIXME: is there a faster multiplication routine if both inputs have
   norm 1? */

static void
gfp_ext_mul (mpres_t r_0, mpres_t r_1, const mpres_t a_0, const mpres_t a_1,
	     const mpres_t b_0, const mpres_t b_1, const mpres_t Delta,
	     mpmod_t modulus, ATTRIBUTE_UNUSED const unsigned long tmplen,
	     mpres_t *tmp)
{
  ASSERT (tmplen >= 2);
  if (0 && test_verbose (OUTPUT_TRACE))
    {
      mpz_t t;
      mpz_init (t);
      mpres_get_z (t, Delta, modulus);
      outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ w = quadgen (4*%Zd); "
               "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
      mpz_clear (t);
      outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ (");
      gfp_ext_print (a_0, a_1, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, ") * (");
      gfp_ext_print (b_0, b_1, modulus, OUTPUT_TRACE);
    }

  mpres_add (tmp[0], a_0, a_1, modulus);
  mpres_add (tmp[1], b_0, b_1, modulus);
  mpres_mul (tmp[1], tmp[0], tmp[1], modulus); /* t[1] = (a_0+a_1)*(b_0+b_1) =
					    a_0*b_0 + a_0*b_1 + a_1*b_0 +
					    a_1*b_1 */

  mpres_mul (r_0, a_0, b_0, modulus);    /* r_0 = a_0*b_0. We don't need a_0
					    or b_0 any more now */
  mpres_sub (tmp[1], tmp[1], r_0, modulus);  /* t[1] = a_0*b_1 + a_1*b_0 +
						a_1*b_1 */

  mpres_mul (tmp[0], a_1, b_1, modulus);   /* t[0] = a_1*b_1. We don't need
					      a_1 or b_1 any more now */
  mpres_sub (r_1, tmp[1], tmp[0], modulus);  /* r_1 == a_0*b_1 + a_1*b_0 */

  mpres_mul (tmp[0], tmp[0], Delta, modulus); /* t[0] = a_1*b_1*Delta */
  mpres_add (r_0, r_0, tmp[0], modulus);   /* r_0 = a_0*b_0 + a_1*b_1*Delta */

  if (0 && test_verbose (OUTPUT_TRACE))
    {
      outputf (OUTPUT_TRACE, ") == ");
      gfp_ext_print (r_0, r_1, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, " /* PARI C */\n");
    }
}


/* Computes (a_0 + a_1 * sqrt(Delta))^2, where the norm
   (a_0^2 - a_1^2*Delta) is assumed to be equal to 1. Hence
   (a_0 + a_1 * sqrt(Delta))^2 = a_0^2 + 2*a_0*a_1*sqrt(Delta) + a_1^2*Delta
   and a_0^2 + a_1^2*Delta = a_0^2 + a_1^2*Delta + norm - 1 = 2*a_0^2 - 1.
   a_0 and r_0, as well as a_1 and r_1 may overlap */

static void
gfp_ext_sqr_norm1 (mpres_t r_0, mpres_t r_1, const mpres_t a_0,
		   const mpres_t a_1, mpmod_t modulus)
{
  ASSERT (a_0 != r_1);  /* a_0 is read after r_1 is written */

  if (pari)
    gmp_printf ("/* gfp_ext_sqr_norm1 */ (%Zd + %Zd * w)^2 %% N == ", a_0, a_1);

  mpres_mul (r_1, a_0, a_1, modulus);
  mpres_add (r_1, r_1, r_1, modulus);       /* r_1 = 2*a_0*a_1 */

  mpres_sqr (r_0, a_0, modulus);
  mpres_add (r_0, r_0, r_0, modulus);
  mpres_sub_ui (r_0, r_0, 1UL, modulus);    /* r_0 = 2*a_0^2 - 1 */

  if (pari)
    gmp_printf ("(%Zd + %Zd * w) %% N /* PARI C */\n", r_0, r_1);
}


/* Raise (a0 + a1*sqrt(Delta)) to the power e which is a signed long int.
   (a0 + a1*sqrt(Delta)) is assumed to have norm 1, i.e.
   a0^2 - a1^2*Delta == 1. The result is (r0 * r1*sqrt(Delta)).
   a0, a1, r0 and r1 must not overlap */

static void
gfp_ext_pow_norm1_sl (mpres_t r0, mpres_t r1, const mpres_t a0,
                      const mpres_t a1, const long e, const mpres_t Delta,
                      mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
{
  const unsigned long abs_e = labs (e);
  unsigned long mask = ~0UL - (~0UL >> 1);

  ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);

  if (e == 0)
    {
      mpres_set_ui (r0, 1UL, modulus);
      mpres_set_ui (r1, 0UL, modulus);
      return;
    }

  /* If e < 0, we want 1/(a0 + a1*sqrt(Delta)). By extending with
     a0 - a1*sqrt(Delta), we get
     (a0 - a1*sqrt(Delta)) / (a0^2 - a1^2 * Delta), but that denomiator
     is the norm which is known to be 1, so the result is
     a0 - a1*sqrt(Delta). */

  while ((abs_e & mask) == 0UL)
    mask >>= 1;

  mpres_set (r0, a0, modulus);
  mpres_set (r1, a1, modulus);

  while (mask > 1UL)
    {
      gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
      mask >>= 1;
      if (abs_e & mask)
	gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
    }

  if (e < 0)
    mpres_neg (r1, r1, modulus);

  if (0 && test_verbose (OUTPUT_TRACE))
    {
      mpz_t t;
      mpz_init (t);
      mpres_get_z (t, Delta, modulus);
      outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ w = quadgen (4*%Zd); "
               "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
      mpz_clear (t);
      outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ (");
      gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, ")^(%ld) == ", e);
      gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, " /* PARI C */\n");
    }
}


/* Same, but taking an mpz_t argument for the exponent */

static void
gfp_ext_pow_norm1 (mpres_t r0, mpres_t r1, const mpres_t a0,
                   const mpres_t a1, mpz_t e, const mpres_t Delta,
                   mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
{
  mpz_t abs_e;
  unsigned long idx;

  ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);

  if (mpz_sgn (e) == 0)
    {
      mpres_set_ui (r0, 1UL, modulus);
      mpres_set_ui (r1, 0UL, modulus);
      return;
    }

  mpz_init (abs_e);
  mpz_abs (abs_e, e);
  idx = mpz_sizeinbase (abs_e, 2) - 1; /* Thus mpz_tstbit (abs_e, idx) == 1 */
  ASSERT (mpz_tstbit (abs_e, idx) == 1);

  mpres_set (r0, a0, modulus);
  mpres_set (r1, a1, modulus);

  while (idx > 0UL)
    {
      gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
      idx--;
      if (mpz_tstbit (abs_e, idx))
	gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
    }

  if (mpz_sgn (e) < 0)
    mpres_neg (r1, r1, modulus);

  mpz_clear (abs_e);

  if (test_verbose (OUTPUT_TRACE))
    {
      mpz_t t;
      mpz_init (t);
      mpres_get_z (t, Delta, modulus);
      outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ w = quadgen (4*%Zd); "
               "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
      mpz_clear (t);
      outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ (");
      gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, ")^(%Zd) == ", e);
      gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, " /* PARI C */\n");
    }
}

#if 0
/* Compute r[i] = a^((k+i)^2) for i = 0, 1, ..., l-1, where "a" is an
   element of norm 1 in the quadratic extension ring */

static void
gfp_ext_rn2 (mpres_t *r_x, mpres_t *r_y, const mpres_t a_x, const mpres_t a_y,
	     const long k, const unsigned long l, const mpres_t Delta,
	     mpmod_t modulus, const unsigned long origtmplen, mpres_t *origtmp)
{
  mpres_t *r2_x = origtmp, *r2_y = origtmp + 2, *v = origtmp + 4,
    *V2 = origtmp + 6;
  const unsigned long newtmplen = origtmplen - 7;
  mpres_t *newtmp = origtmp + 7;
  unsigned long i;

  if (l == 0UL)
    return;

  ASSERT (origtmplen >= 8UL);

  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ ; a = %Zd + %Zd * w; /* PARI */\n",
		a_x, a_y, modulus->orig_modulus);

  /* Compute r[0] = a^(k^2). We do it by two exponentiations by k and use
     v[0] and v[1] as temp storage */
  gfp_ext_pow_norm1_sl (v[0], v[1], a_x, a_y, k, Delta, modulus, newtmplen,
		     newtmp);
  gfp_ext_pow_norm1_sl (r_x[0], r_y[0], v[0], v[1], k, Delta, modulus,
		     newtmplen, newtmp);
  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
		"/* PARI C */\n", k, r_x[0], r_y[0]);

  /* Compute r[1] = a^((k+1)^2) = a^(k^2 + 2k + 1)*/
  if (l > 1)
    {
      /* v[0] + v[1]*sqrt(Delta) still contains a^k */
      gfp_ext_sqr_norm1 (r_x[1], r_y[1], v[0], v[1], modulus);
      /* Now r[1] = a^(2k) */
      gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], r_x[0], r_y[0], Delta,
		   modulus, newtmplen, newtmp);
      /* Now r[1] = a^(k^2 + 2k) */
      gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], a_x, a_y, Delta, modulus,
		   newtmplen, newtmp);
      /* Now r[1] = a^(k^2 + 2k + 1) = a^((k+1)^2) */
    }
  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
		"/* PARI C */\n", k + 1, r_x[1], r_y[1]);

  /* Compute r2[0] = a^(k^2+2) = a^(k^2) * a^2 */
  gfp_ext_sqr_norm1 (v[0], v[1], a_x, a_y, modulus);
  gfp_ext_mul (r2_x[0], r2_y[0], r_x[0], r_y[0], v[0], v[1], Delta, modulus,
	       newtmplen, newtmp);
  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
		"/* PARI C */\n", k, r2_x[0], r2_y[0]);
  /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
  gfp_ext_mul (r2_x[1], r2_y[1], r_x[1], r_y[1], v[0], v[1], Delta, modulus,
	       newtmplen, newtmp);
  if (pari)
    gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
		"/* PARI C */\n", k + 1, r2_x[1], r2_y[1]);

  /* Compute V_2(a + 1/a). Since 1/a = a_x - a_y, we have a+1/a = 2*a_x.
     V_2(x) = x^2 - 2, so we want 4*a_x^2 - 2. */
  mpres_add (*V2, a_x, a_x, modulus); /* V2 = a + 1/a  = 2*a_x*/
  V (v[0], *V2, 2 * k + 1, modulus);  /* v[0] = V_{2k+1} (a + 1/a) */
  V (v[1], *V2, 2 * k + 3, modulus);  /* v[0] = V_{2k+3} (a + 1/a) */
  mpres_sqr (*V2, *V2, modulus);      /* V2 = 4*a_x^2 */
  mpres_sub_ui (*V2, *V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
  if (pari)
    {
      gmp_printf ("/* In gfp_ext_rn2 */ ((a + 1/a)^2 - 2) %% N == "
		  "%Zd %% N /* PARI C */\n", *V2);
      gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
		  "/* PARI C */\n", 2 * k + 1, v[0]);
      gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
		  "/* PARI C */\n", 2 * k + 3, v[1]);
    }

  /* Compute the remaining a^((k+i)^2) values according to Peter's
     recurrence */

  for (i = 2; i < l; i++)
    {
      /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i taken
	 modulo 2 */
      mpres_mul (r_x[i], r2_x[1 - i % 2], v[i % 2], modulus);
      mpres_sub (r_x[i], r_x[i], r2_x[i % 2], modulus);
      mpres_mul (r_y[i], r2_y[1 - i % 2], v[i % 2], modulus);
      mpres_sub (r_y[i], r_y[i], r2_y[i % 2], modulus);

      /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
      mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
      mpres_sub (r2_x[i % 2], r2_x[i % 2], r_x[i - 2], modulus);
      mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
      mpres_sub (r2_y[i % 2], r2_y[i % 2], r_y[i - 2], modulus);

      /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
      mpres_mul (newtmp[0], v[1 - i % 2], *V2, modulus);
      mpres_sub (v[i % 2], newtmp[0], v[i % 2], modulus);
      if (pari)
	gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
		    "/* PARI C */\n", 2 * (k + i) + 1, v[i % 2]);
    }
}
#endif

/* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l.
   x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P. */

static void
pp1_sequence_g (listz_t g_x, listz_t g_y, mpzspv_t g_x_ntt, mpzspv_t g_y_ntt,
		const mpres_t b1_x, const mpres_t b1_y, const unsigned long P,
		const mpres_t Delta, const long M_param,
		const unsigned long l_param, const mpz_t m_1, const long k_2,
		const mpmod_t modulus_param, const mpzspm_t ntt_context)
{
  const unsigned long tmplen = 3;
  const int want_x = (g_x != NULL || g_x_ntt != NULL);
  const int want_y = (g_y != NULL || g_y_ntt != NULL);
  mpres_t r_x, r_y, x0_x, x0_y, v2,
      r1_x[2], r1_y[2], r2_x[2], r2_y[2],
      v[2], tmp[3];
  mpz_t mt;
  mpmod_t modulus; /* Thread-local copy of modulus_param */
  unsigned long i, l = l_param, offset = 0;
  long M = M_param;
  long timestart, realstart;
  int want_output = 1;

  outputf (OUTPUT_VERBOSE, "Computing %s%s%s",
	   (want_x) ? "g_x" : "",
	   (want_x && want_y) ? " and " : "",
	   (want_y) ? "g_y" : "");
  timestart = cputime ();
  realstart = realtime ();

#ifdef _OPENMP
#pragma omp parallel if (l > 100) private(r_x, r_y, x0_x, x0_y, v2, r1_x, r1_y, r2_x, r2_y, v, tmp, mt, modulus, i, l, offset, M, want_output)
  {
    /* When multi-threading, we adjust the parameters for each thread */

    const int nr_chunks = omp_get_num_threads();
    const int thread_nr = omp_get_thread_num();

    l = (l_param - 1) / nr_chunks + 1;
    offset = thread_nr * l;
    if (offset <= l_param)
      l = MIN(l, l_param - offset);
    else
      l = 0;
    M = M_param - (long) offset;

    want_output = (omp_get_thread_num() == 0);
    if (want_output)
      outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
#endif
    mpmod_init_set (modulus, modulus_param);
    mpres_init (r_x, modulus);
    mpres_init (r_y, modulus);
    mpres_init (x0_x, modulus);
    mpres_init (x0_y, modulus);
    mpres_init (v2, modulus);
    for (i = 0; i < 2UL; i++)
      {
	mpres_init (r1_x[i], modulus);
	mpres_init (r1_y[i], modulus);
	mpres_init (r2_x[i], modulus);
	mpres_init (r2_y[i], modulus);
	mpres_init (v[i], modulus);
      }
    for (i = 0; i < tmplen; i++)
      mpres_init (tmp[i], modulus);
    mpz_init (mt);

    if (want_output && test_verbose (OUTPUT_TRACE))
      {
	mpres_get_z (mt, Delta, modulus);
	outputf (OUTPUT_TRACE,
		 "\n/* pp1_sequence_g */ w = quadgen (4*%Zd); P = %lu; "
		 "M = %ld; k_2 = %ld; m_1 = %Zd; N = %Zd; /* PARI */\n",
		 mt, P, M, k_2, m_1, modulus->orig_modulus);

	outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ b_1 = ");
	gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
	outputf (OUTPUT_TRACE, "; /* PARI */\n");
	outputf (OUTPUT_TRACE,
		 "/* pp1_sequence_g */ r = b_1^P; /* PARI */\n");
	outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
		 "x_0 = b_1^(2*k_2 + (2*m_1 + 1) * P); /* PARI */\n");
	outputf (OUTPUT_TRACE,
		 "/* pp1_sequence_g */ addrec(x) = x + 1/x; /* PARI */\n");
      }

    /* Compute r */
    gfp_ext_pow_norm1_sl (r_x, r_y, b1_x, b1_y, P, Delta, modulus,
			  tmplen, tmp);
    if (want_output && test_verbose (OUTPUT_TRACE))
      {
	outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ r == ");
	gfp_ext_print (r_x, r_y, modulus, OUTPUT_TRACE);
	outputf (OUTPUT_TRACE, " /* PARI C */\n");
      }

    /* Compute x0 = x_0 */
    mpz_mul_2exp (mt, m_1, 1UL);
    mpz_add_ui (mt, mt, 1UL);
    mpz_mul_ui (mt, mt, P);
    mpz_add_si (mt, mt, k_2);
    mpz_add_si (mt, mt, k_2); /* mt = 2*k_2 + (2*m_1 + 1) * P */
    gfp_ext_pow_norm1 (x0_x, x0_y, b1_x, b1_y, mt, Delta, modulus,
		       tmplen, tmp);
    if (want_output && test_verbose (OUTPUT_TRACE))
      {
	outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ x_0 == ");
	gfp_ext_print (x0_x, x0_y, modulus, OUTPUT_TRACE);
	outputf (OUTPUT_TRACE, " /* PARI C */\n");
      }


    /* Compute g[1] = r1[0] = x0^M * r^(M^2) = (x0 * r^M)^M.
       We use v[0,1] as temporary storage */
    gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M, Delta, modulus,
			  tmplen, tmp); /* v[0,1] = r^M */
    gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
		 tmplen, tmp); /* v[0,1] = r^M * x_0 */
    gfp_ext_pow_norm1_sl (r1_x[0], r1_y[0], v[0], v[1], M, Delta, modulus,
			  tmplen, tmp); /* r1[0] = (r^M * x_0)^M */
    if (l > 0)
      {
        if (g_x != NULL)
          mpres_get_z (g_x[offset], r1_x[0], modulus);
        if (g_y != NULL)
          mpres_get_z (g_y[offset], r1_y[0], modulus);
        if (g_x_ntt != NULL)
          {
            mpres_get_z (mt, r1_x[0], modulus);
            mpzspv_from_mpzv (g_x_ntt, offset, &mt, 1UL, ntt_context);
          }
        if (g_y_ntt != NULL)
          {
            mpres_get_z (mt, r1_y[0], modulus);
            mpzspv_from_mpzv (g_y_ntt, offset, &mt, 1UL, ntt_context);
          }
      }


    /* Compute g[1] = r1[1] = x0^(M-1) * r^((M-1)^2) = (x0 * r^(M-1))^(M-1).
       We use v[0,1] as temporary storage. FIXME: simplify, reusing g_0 */
    gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M - 1, Delta, modulus,
			  tmplen, tmp);
    gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
		 tmplen, tmp);
    gfp_ext_pow_norm1_sl (r1_x[1], r1_y[1], v[0], v[1], M - 1, Delta,
			  modulus, tmplen, tmp);
    if (l > 1)
      {
        if (g_x != NULL)
          mpres_get_z (g_x[offset + 1], r1_x[1], modulus);
        if (g_y != NULL)
          mpres_get_z (g_y[offset + 1], r1_y[1], modulus);
        if (g_x_ntt != NULL)
          {
            mpres_get_z (mt, r1_x[1], modulus);
            mpzspv_from_mpzv (g_x_ntt, offset + 1, &mt, 1UL, ntt_context);
          }
        if (g_y_ntt != NULL)
          {
            mpres_get_z (mt, r1_y[1], modulus);
            mpzspv_from_mpzv (g_y_ntt, offset + 1, &mt, 1UL, ntt_context);
          }
      }


    /* x0 := $x_0 * r^{2M - 3}$ */
    /* We don't need x0 after this so we overwrite it. We use v[0,1] as
       temp storage for $r^{2M - 3}$. */
    gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, 2UL*M - 3UL, Delta, modulus,
			  tmplen, tmp);
    gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, v[0], v[1], Delta, modulus,
		 tmplen, tmp);

    /* Compute r2[0] = r1[0] * r^2 and r2[1] = r1[1] * r^2. */
    /* We only need $r^2$ from here on, so we set r = $r^2$ */
    gfp_ext_sqr_norm1 (r_x, r_y, r_x, r_y, modulus);
    gfp_ext_mul (r2_x[0], r2_y[0], r1_x[0], r1_y[0], r_x, r_y, Delta,
		 modulus, tmplen, tmp);
    gfp_ext_mul (r2_x[1], r2_y[1], r1_x[1], r1_y[1], r_x, r_y, Delta,
		 modulus, tmplen, tmp);

    /* v[1] := $x_0 * r^{2*M - 3} + 1/(x_0 * r^{2M - 3}) */
    mpres_add (v[1], x0_x, x0_x, modulus);
    /* x0 := x0 * r = $x_0 * r^{2M - 1}$ */
    gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, r_x, r_y, Delta, modulus,
		 tmplen, tmp);
    /* v[0] := $x_0 * r^{2M - 1} + 1/(x_0 * r^{2M - 1}) */
    mpres_add (v[0], x0_x, x0_x, modulus);

    /* v2 = V_2 (r + 1/r) = r^2 + 1/r^2 */
    mpres_add (v2, r_x, r_x, modulus);

    /* We don't need the contents of r any more and use it as a temp var */

    for (i = 2; i < l; i++)
      {
	if (want_x)
	  {
	    /* r1[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
	       taken modulo 2. We store the new r1_x[i] in r_x for now */
	    mpres_mul (r_x, r2_x[1 - i % 2], v[i % 2], modulus);
	    mpres_sub (r_x, r_x,            r2_x[i % 2], modulus);
	    /* r2[i] = r2[i-1] * v[i-1] - r1[i-2] */
	    mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
	    mpres_sub (r2_x[i % 2], r2_x[i % 2],     r1_x[i % 2], modulus);
	    mpres_set (r1_x[i % 2], r_x, modulus); /* FIXME, avoid this copy */
	    if (g_x != NULL)
	      mpres_get_z (g_x[offset + i], r_x, modulus); /* FIXME, avoid these REDC */
	    if (g_x_ntt != NULL)
	      {
		mpres_get_z (mt, r_x, modulus);
		mpzspv_from_mpzv (g_x_ntt, offset + i, &mt, 1UL, ntt_context);
	      }
	  }

	if (want_y)
	  {
	    /* Same for y coordinate */
	    mpres_mul (r_y, r2_y[1 - i % 2], v[i % 2], modulus);
	    mpres_sub (r_y, r_y,             r2_y[i % 2], modulus);
	    mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
	    mpres_sub (r2_y[i % 2], r2_y[i % 2],     r1_y[i % 2], modulus);
	    mpres_set (r1_y[i % 2], r_y, modulus);
	    if (g_y != NULL)
	      mpres_get_z (g_y[offset + i], r_y, modulus); /* Keep r1, r2 in mpz_t ? */
	    if (g_y_ntt != NULL)
	      {
		mpres_get_z (mt, r_y, modulus);
		mpzspv_from_mpzv (g_y_ntt, offset + i, &mt, 1UL, ntt_context);
	      }
	  }

	/* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
	mpres_mul (r_x, v[1 - i % 2], v2, modulus);
	mpres_sub (v[i % 2], r_x, v[i % 2], modulus);
	if (want_output && test_verbose (OUTPUT_TRACE))
	  {
	    mpz_t t;
	    mpz_init (t);
	    mpres_get_z (t, v[i % 2], modulus);
	    outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
		     "addrec(x_0 * r^(2*(M-%lu) - 1)) == %Zd /* PARI C */\n",
		     i, t);
	    mpz_clear (t);
	  }
      }

    mpres_clear (r_x, modulus);
    mpres_clear (r_y, modulus);
    mpres_clear (x0_x, modulus);
    mpres_clear (x0_y, modulus);
    mpres_clear (v2, modulus);
    for (i = 0; i < 2; i++)
      {
	mpres_clear (r1_x[i], modulus);
	mpres_clear (r1_y[i], modulus);
	mpres_clear (r2_x[i], modulus);
	mpres_clear (r2_y[i], modulus);
	mpres_clear (v[i], modulus);
      }
    for (i = 0; i < tmplen; i++)
      mpres_clear (tmp[i], modulus);
    mpz_clear (mt);
    mpmod_clear (modulus);
#ifdef _OPENMP
  }
#endif

  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

  if (g_x != NULL && g_y != NULL && test_verbose(OUTPUT_TRACE))
    {
      for (i = 0; i < l_param; i++)
	{
	  outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu = "
		   "x_0^(M-%lu) * r^((M-%lu)^2); /* PARI */", i, i, i);
	  outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu == "
		   "%Zd + %Zd*w /* PARI C */\n", i, g_x[i], g_y[i]);
	}
    }
}


/* Compute r[i] = b1^(-P*(k+i)^2) * f_i for i = 0, 1, ..., l-1, where "b1" is
   an element of norm 1 in the quadratic extension ring */

static void
pp1_sequence_h (listz_t h_x, listz_t h_y, mpzspv_t h_x_ntt, mpzspv_t h_y_ntt,
		const listz_t f, const mpres_t b1_x, const mpres_t b1_y,
		const long k_param, const unsigned long l_param,
		const unsigned long P, const mpres_t Delta,
		mpmod_t modulus_param, const mpzspm_t ntt_context)
{
  unsigned long i;
  long timestart, realstart;

  if (l_param == 0UL)
    return;

  ASSERT (f != h_x);
  ASSERT (f != h_y);

  outputf (OUTPUT_VERBOSE, "Computing h_x and h_y");
  timestart = cputime ();
  realstart = realtime ();

  if (test_verbose (OUTPUT_TRACE))
    {
      mpz_t t;
      mpz_init (t);
      mpres_get_z (t, Delta, modulus_param);
      outputf (OUTPUT_TRACE, "\n/* pp1_sequence_h */ N = %Zd; "
	       "Delta = %Zd; w = quadgen (4*Delta); k = %ld; P = %lu; "
	       "/* PARI */\n", modulus_param->orig_modulus, t, k_param, P);
      outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ b_1 = ");
      gfp_ext_print (b1_x, b1_y, modulus_param, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, "; r = b_1^P; rn = b_1^(-P); /* PARI */\n");
      for (i = 0; i < l_param; i++)
	outputf (OUTPUT_TRACE,
		 "/* pp1_sequence_h */ f_%lu = %Zd; /* PARI */\n", i, f[i]);
      mpz_clear (t);
    }

#ifdef _OPENMP
#pragma omp parallel if (l_param > 100) private(i)
#endif
  {
    const size_t tmplen = 2;
    mpres_t s_x[3], s_y[3], s2_x[2], s2_y[2], v[2], V2, rn_x, rn_y,
      tmp[2];
    mpmod_t modulus; /* Thread-local copy of modulus_param */
    mpz_t mt;
    unsigned long l = l_param, offset = 0;
    long k = k_param;

#ifdef _OPENMP
    /* When multi-threading, we adjust the parameters for each thread */

    const int nr_chunks = omp_get_num_threads();
    const int thread_nr = omp_get_thread_num();

    l = (l_param - 1) / nr_chunks + 1;
    offset = thread_nr * l;
    if (offset <= l_param)
      l = MIN(l, l_param - offset);
    else
      l = 0;

    if (thread_nr == 0)
      outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
    outputf (OUTPUT_TRACE, "\n");
#endif

    /* Each thread computes r[i + offset] = b1^(-P*(k+i+offset)^2) * f_i
       for i = 0, 1, ..., l-1, where l is the adjusted length of each thread */

    /* Test that k+offset does not overflow */
    ASSERT_ALWAYS (offset <= (unsigned long) LONG_MAX &&
		   k <= LONG_MAX - (long) offset);
    k += (long) offset;

    mpz_init (mt);
    /* Make thread-local copy of modulus */
    mpmod_init_set (modulus, modulus_param);

    /* Init the local mpres_t variables */
    for (i = 0; i < 2; i++)
      {
	mpres_init (s_x[i], modulus);
	mpres_init (s_y[i], modulus);
	mpres_init (s2_x[i], modulus);
	mpres_init (s2_y[i], modulus);
	mpres_init (v[i], modulus);
      }
    mpres_init (s_x[2], modulus);
    mpres_init (s_y[2], modulus);
    mpres_init (V2, modulus);
    mpres_init (rn_x, modulus);
    mpres_init (rn_y, modulus);
    for (i = 0; i < (unsigned long) tmplen; i++)
      mpres_init (tmp[i], modulus);

    /* Compute rn = b_1^{-P}. It has the same value for all threads,
       but we make thread local copies anyway. */
    gfp_ext_pow_norm1_sl (rn_x, rn_y, b1_x, b1_y, P, Delta, modulus, tmplen,
			  tmp);
    mpres_neg (rn_y, rn_y, modulus);

    /* Compute s[0] = rn^(k^2) = r^(-k^2). We do it by two
       exponentiations by k and use v[0] and v[1] as temp storage */
    gfp_ext_pow_norm1_sl (v[0], v[1], rn_x, rn_y, k, Delta, modulus,
			  tmplen, tmp);
    gfp_ext_pow_norm1_sl (s_x[0], s_y[0], v[0], v[1], k, Delta, modulus,
			  tmplen, tmp);
    if (test_verbose (OUTPUT_TRACE))
      {
#ifdef _OPENMP
#pragma omp critical
#endif
	{
	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ", k);
	  gfp_ext_print (s_x[0], s_y[0], modulus, OUTPUT_TRACE);
	  outputf (OUTPUT_TRACE, " /* PARI C */\n");
	}
      }

    /* Compute s[1] = r^(-(k+1)^2) = r^(-(k^2 + 2k + 1))*/
    if (l > 1)
      {
	/* v[0] + v[1]*sqrt(Delta) still contains rn^k */
	gfp_ext_sqr_norm1 (s_x[1], s_y[1], v[0], v[1], modulus);
	/* Now s[1] = r^(-2k) */
	gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], s_x[0], s_y[0], Delta,
		     modulus, tmplen, tmp);
	/* Now s[1] = r^(-(k^2 + 2k)) */
	gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], rn_x, rn_y, Delta,
		     modulus, tmplen, tmp);
	/* Now s[1] = r^(-(k^2 + 2k + 1)) = r^(-(k+1)^2) */
	if (test_verbose (OUTPUT_TRACE))
	  {
#ifdef _OPENMP
#pragma omp critical
#endif
	    {
	      outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ",
		       k + 1);
	      gfp_ext_print (s_x[1], s_y[1], modulus, OUTPUT_TRACE);
	      outputf (OUTPUT_TRACE, " /* PARI C */\n");
	    }
	  }
      }

    /* Compute s2[0] = r^(k^2+2) = r^(k^2) * r^2 */
    gfp_ext_sqr_norm1 (v[0], v[1], rn_x, rn_y, modulus);
    gfp_ext_mul (s2_x[0], s2_y[0], s_x[0], s_y[0], v[0], v[1], Delta, modulus,
		 tmplen, tmp);
    if (test_verbose (OUTPUT_TRACE))
      {
#ifdef _OPENMP
#pragma omp critical
#endif
	{
	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ", k);
	  gfp_ext_print (s2_x[0], s2_y[0], modulus, OUTPUT_TRACE);
	  outputf (OUTPUT_TRACE, " /* PARI C */\n");
	}
      }

    /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
    gfp_ext_mul (s2_x[1], s2_y[1], s_x[1], s_y[1], v[0], v[1], Delta, modulus,
		 tmplen, tmp);
    if (test_verbose (OUTPUT_TRACE))
      {
#ifdef _OPENMP
#pragma omp critical
#endif
	{
	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ",
		   k + 1);
	  gfp_ext_print (s2_x[1], s2_y[1], modulus, OUTPUT_TRACE);
	  outputf (OUTPUT_TRACE, " /* PARI C */\n");
	}
      }

    /* Compute V_2(r + 1/r). Since 1/r = rn_x - rn_y, we have r+1/r = 2*rn_x.
       V_2(x) = x^2 - 2, so we want 4*rn_x^2 - 2. */
    mpres_add (V2, rn_x, rn_x, modulus); /* V2 = r + 1/r  = 2*rn_x */
    V (v[0], V2, 2 * k + 1, modulus);  /* v[0] = V_{2k+1} (r + 1/r) */
    V (v[1], V2, 2 * k + 3, modulus);  /* v[1] = V_{2k+3} (r + 1/r) */
    mpres_sqr (V2, V2, modulus);       /* V2 = 4*a_x^2 */
    mpres_sub_ui (V2, V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
    if (test_verbose (OUTPUT_TRACE))
      {
#ifdef _OPENMP
#pragma omp critical
#endif
	{
	  mpres_get_z (mt, V2, modulus);
	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^2 + 1/r^2 == %Zd "
		   "/* PARI C */\n", mt);
	  mpres_get_z (mt, v[0], modulus);
	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+1) + "
		   "1/r^(2*%ld+1) == %Zd /* PARI C */\n", k, k, mt);
	  mpres_get_z (mt, v[1], modulus);
	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+3) + "
		   "1/r^(2*%ld+3) == %Zd /* PARI C */\n", k, k, mt);
	}
      }

    for (i = 0; i < 2UL && i < l; i++)
      {
	/* Multiply the 2nd coordinate by Delta, so that after the polynomial
	   multipoint evaluation we get x1 + Delta*x2 */
	mpres_mul (s_y[i], s_y[i], Delta, modulus);
	mpres_mul (s2_y[i], s2_y[i], Delta, modulus);

	if (h_x != NULL)
	  mpres_mul_z_to_z (h_x[i + offset], s_x[i], f[i + offset], modulus);
	if (h_y != NULL)
	  mpres_mul_z_to_z (h_y[i + offset], s_y[i], f[i + offset], modulus);
	if (h_x_ntt != NULL)
	  {
	    mpres_mul_z_to_z (mt, s_x[i], f[i + offset], modulus);
	    mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
	  }
	if (h_y_ntt != NULL)
	  {
	    mpres_mul_z_to_z (mt, s_y[i], f[i + offset], modulus);
	    mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
	  }
      }

    /* Compute the remaining r^((k+i)^2) values according to Peter's
       recurrence */

    for (i = 2; i < l; i++)
      {
	if (h_x != NULL || h_x_ntt != NULL)
	  {
	    /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
	       taken modulo 2 */
	    mpres_mul (s_x[i % 3], s2_x[1 - i % 2], v[i % 2], modulus);
	    mpres_sub (s_x[i % 3], s_x[i % 3], s2_x[i % 2], modulus);

	    /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
	    mpres_mul (s2_x[i % 2], s2_x[1 - i % 2], v[1 - i % 2], modulus);
	    mpres_sub (s2_x[i % 2], s2_x[i % 2], s_x[(i - 2) % 3], modulus);
	    if (h_x != NULL)
	      mpres_mul_z_to_z (h_x[i + offset], s_x[i % 3], f[i + offset],
				modulus);
	    if (h_x_ntt != NULL)
	      {
		mpres_mul_z_to_z (mt, s_x[i % 3], f[i + offset], modulus);
		mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
	      }
	  }

	if (h_y != NULL || h_y_ntt != NULL)
	  {
	    /* Same for y coordinate */
	    mpres_mul (s_y[i % 3], s2_y[1 - i % 2], v[i % 2], modulus);
	    mpres_sub (s_y[i % 3], s_y[i % 3], s2_y[i % 2], modulus);
	    mpres_mul (s2_y[i % 2], s2_y[1 - i % 2], v[1 - i % 2], modulus);
	    mpres_sub (s2_y[i % 2], s2_y[i % 2], s_y[(i - 2) % 3], modulus);
	    if (h_y != NULL)
	      mpres_mul_z_to_z (h_y[i + offset], s_y[i % 3], f[i + offset],
				modulus);
	    if (h_y_ntt != NULL)
	      {
		mpres_mul_z_to_z (mt, s_y[i % 3], f[i + offset], modulus);
		mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
	      }
	  }

	/* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
	mpres_mul (tmp[0], v[1 - i % 2], V2, modulus);
	mpres_sub (v[i % 2], tmp[0], v[i % 2], modulus);
      }

    /* Clear the local mpres_t variables */
    for (i = 0; i < 2; i++)
      {
	mpres_clear (s_x[i], modulus);
	mpres_clear (s_y[i], modulus);
	mpres_clear (s2_x[i], modulus);
	mpres_clear (s2_y[i], modulus);
	mpres_clear (v[i], modulus);
      }
    mpres_clear (s_x[2], modulus);
    mpres_clear (s_y[2], modulus);
    mpres_clear (V2, modulus);
    mpres_clear (rn_x, modulus);
    mpres_clear (rn_y, modulus);
    for (i = 0; i < tmplen; i++)
      mpres_clear (tmp[i], modulus);

    /* Clear the thread-local copy of modulus */
    mpmod_clear (modulus);

    mpz_clear (mt);
  }

  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

  if (h_x != NULL && h_y != NULL && test_verbose (OUTPUT_TRACE))
    {
      for (i = 0; i < l_param; i++)
	gmp_printf ("/* pp1_sequence_h */ (rn^((k+%lu)^2) * f_%lu) == "
		    "(%Zd + Mod(%Zd / Delta, N) * w) /* PARI C */\n",
		    i, i, h_x[i], h_y[i]);
    }
}


int
pp1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus,
	const faststage2_param_t *params)
{
  unsigned long nr;
  unsigned long i, l, lenF, lenH, lenG, lenR, tmplen;
  sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
                       progressions of prime length */
  set_long_t *S_2; /* This is stored as a regular set */
  listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
		  degree s_1. It is symmetric, so has only s_1 / 2 + 1
		  distinct coefficients. The sequence h_j will be stored in
		  the same memory and won't be a monic polynomial, so the
		  leading 1 monomial of F will be stored explicitly. Hence we
		  need s_1 / 2 + 1 entries. */

  listz_t g_x, g_y, fh_x, fh_y, h_x, h_y, tmp, R_x, R_y;
  const unsigned long tmpreslen = 2UL;
  mpres_t b1_x, b1_y, Delta, tmpres[2];
  mpz_t mt;   /* All-purpose temp mpz_t */
  int youpi = ECM_NO_FACTOR_FOUND;
  long timetotalstart, realtotalstart, timestart;

  timetotalstart = cputime ();
  realtotalstart = realtime ();

  ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
  ASSERT_ALWAYS (params->s_1 < params->l);
  nr = params->l - params->s_1; /* Number of points we evaluate */

  if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
      return ECM_ERROR;

  /* Allocate all the memory we'll need */
  /* Allocate the correct amount of space for each mpz_t or the
     reallocations will up to double the time for stage 2! */
  mpz_init (mt);
  mpres_init (b1_x, modulus);
  mpres_init (b1_y, modulus);
  mpres_init (Delta, modulus);
  for (i = 0; i < tmpreslen; i++)
      mpres_init (tmpres[i], modulus);
  lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
				     the leading 1 monomial for each factor */
  lenH = params->s_1 + 1;
  lenG = params->l;
  lenR = nr;
  F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
  fh_x = init_list2 (lenF, (unsigned int) abs (modulus->bits));
  fh_y = init_list2 (lenF, (unsigned int) abs (modulus->bits));
  h_x = malloc (lenH * sizeof (mpz_t));
  h_y = malloc (lenH * sizeof (mpz_t));
  if (h_x == NULL || h_y == NULL)
    {
      fprintf (stderr, "Cannot allocate memory in pp1fs2\n");
      exit (1);
    }
  g_x = init_list2 (lenG, (unsigned int) abs (modulus->bits));
  g_y = init_list2 (lenG, (unsigned int) abs (modulus->bits));
  R_x = init_list2 (lenR, (unsigned int) abs (modulus->bits));
  R_y = init_list2 (lenR, (unsigned int) abs (modulus->bits));
  tmplen = 3UL * params->l + list_mul_mem (params->l / 2) + 20;
  outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
  if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
    {
      tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
      /* FIXME: It appears TMulGen_space() returns a too small value! */
      outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n",
	       tmplen);
    }

  tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));

  if (test_verbose (OUTPUT_TRACE))
    {
      mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
      outputf (OUTPUT_TRACE,
	       "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
	       modulus->orig_modulus, mt);
    }

  /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2 k_1}) */
  outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");

  timestart = cputime ();
  i = poly_from_sets_V (F, X, S_1, tmp, tmplen, modulus, NULL, NULL);
  ASSERT_ALWAYS(2 * i == params->s_1);
  ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
  free (S_1);
  S_1 = NULL;

  outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
  if (test_verbose (OUTPUT_TRACE))
    {
      for (i = 0; i < params->s_1 / 2 + 1; i++)
	outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
      outputf (OUTPUT_TRACE, "f(x) = f_0");
      for (i = 1; i < params->s_1 / 2 + 1; i++)
	outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
      outputf (OUTPUT_TRACE, "/* PARI */ \n");
    }

  /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
  mpres_sqr (Delta, X, modulus);
  mpres_sub_ui (Delta, Delta, 4UL, modulus);
  mpres_div_2exp (b1_x, X, 1, modulus);
  mpres_set_ui (b1_y, 1UL, modulus);
  mpres_div_2exp (b1_y, b1_y, 1, modulus);
  if (test_verbose (OUTPUT_TRACE))
    {
      mpres_get_z (mt, Delta, modulus);
      outputf (OUTPUT_TRACE,
	       "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
      gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, "; /* PARI */\n");
      outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
    }

  /* Compute the h sequence h_j = b1^(P*-j^2) * f_j for 0 <= j <= s_1 */
  pp1_sequence_h (fh_x, fh_y, NULL, NULL, F, b1_x, b1_y, 0L,
		  params->s_1 / 2 + 1, params->P, Delta, modulus, NULL);
  /* We don't need F(x) any more */
  clear_list (F, lenF);

  /* Make a symmetric copy of fh in h. */
  for (i = 0; i < params->s_1 / 2 + 1; i++)
    {
      *(h_x[i]) = *(fh_x[params->s_1 / 2 - i]); /* Clone the mpz_t */
      *(h_y[i]) = *(fh_y[params->s_1 / 2 - i]);
    }
  for (i = 0; i < params->s_1 / 2; i++)
    {
      *(h_x[i + params->s_1 / 2 + 1]) = *(fh_x[i + 1]);
      *(h_y[i + params->s_1 / 2 + 1]) = *(fh_y[i + 1]);
    }
  if (test_verbose (OUTPUT_TRACE))
    {
      for (i = 0; i < params->s_1 + 1; i++)
	outputf (OUTPUT_VERBOSE, "h_%lu = %Zd + %Zd * w; /* PARI */\n",
		 i, h_x[i], h_y[i]);
    }

  for (l = 0; l < params->s_2; l++)
    {
      const long M = params->l - 1 - params->s_1 / 2;
      outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
               l + 1, params->s_2);
      pp1_sequence_g (g_x, g_y, NULL, NULL, b1_x, b1_y, params->P,
		      Delta, M, params->l, params->m_1, S_2->elem[l],
		      modulus, NULL);

      /* Do the two convolution products */
      outputf (OUTPUT_VERBOSE, "TMulGen of g_x and h_x");
      timestart = cputime ();
      if (TMulGen (R_x, nr - 1, h_x, params->s_1, g_x, params->l - 1, tmp,
		   modulus->orig_modulus) < 0)
	{
	  outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
		   "of memory)\n");
	  youpi = ECM_ERROR;
	  break;
	}
      outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
      outputf (OUTPUT_VERBOSE, "TMulGen of g_y and h_y");
      timestart = cputime ();
      if (TMulGen (R_y, nr - 1, h_y, params->s_1, g_y, params->l - 1, tmp,
		   modulus->orig_modulus) < 0)
	{
	  outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
		   "of memory)\n");
	  youpi = ECM_ERROR;
	  break;
	}
      outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
      for (i = 0; i < nr; i++)
	  mpz_add (R_x[i], R_x[i], R_y[i]);

      timestart = cputime ();
      mpres_set_ui (tmpres[1], 1UL, modulus); /* Accumulate product in
						 tmpres[1] */
      for (i = 0; i < nr; i++)
      {
	  mpres_set_z_for_gcd (tmpres[0], R_x[i], modulus);
#define TEST_ZERO_RESULT
#ifdef TEST_ZERO_RESULT
	  if (mpres_is_zero (tmpres[0], modulus))
	      outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
#endif
	  mpres_mul (tmpres[1], tmpres[1], tmpres[0], modulus);
      }
      outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)^(1) took %lums\n",
	       cputime () - timestart);
      if (test_verbose(OUTPUT_RESVERBOSE))
      {
	  mpres_get_z (mt, tmpres[1], modulus);
	  outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", mt);
      }

      mpres_gcd (mt, tmpres[1], modulus);
      if (mpz_cmp_ui (mt, 1UL) > 0)
	{
	  mpz_set (f, mt);
	  youpi = ECM_FACTOR_FOUND_STEP2;
	  break;
	}
    }

  mpz_clear (mt);
  mpres_clear (b1_x, modulus);
  mpres_clear (b1_y, modulus);
  mpres_clear (Delta, modulus);
  for (i = 0; i < tmpreslen; i++)
      mpres_clear (tmpres[i], modulus);
  clear_list (fh_x, lenF);
  clear_list (fh_y, lenF);
  free (h_x);
  free (h_y);
  clear_list (g_x, lenG);
  clear_list (g_y, lenG);
  clear_list (R_x, lenR);
  clear_list (R_y, lenR);
  clear_list (tmp, tmplen);
  free (S_2);

  outputf (OUTPUT_NORMAL, "Step 2");
  /* In normal output mode, print only cpu time as we always have.
     In verbose mode, print real time as well if we used multi-threading */
  if (test_verbose (OUTPUT_VERBOSE))
    print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
  else
    print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);

  return youpi;
}


int
pp1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus,
	    const faststage2_param_t *params, const int twopass)
{
  unsigned long nr;
  unsigned long l, lenF;
  sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
                       progressions of prime length */
  set_long_t *S_2; /* This is stored as a regular set */
  listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
		  degree s_1. It is symmetric, so has only s_1 / 2 + 1
		  distinct coefficients. The sequence h_j will be stored in
		  the same memory and won't be a monic polynomial, so the
		  leading 1 monomial of F will be stored explicitly. Hence we
		  need s_1 / 2 + 1 entries. */
  listz_t R = NULL;  /* Is used only for two-pass convolution, has nr
			entries. R is only ever referenced if twopass == 1,
			but gcc does not realize that and complains about
			uninitialized value, so we set it to NULL. */
  mpzspm_t ntt_context;
  mpzspv_t g_x_ntt, g_y_ntt, h_x_ntt, h_y_ntt;
  mpres_t b1_x, b1_y, Delta;
  mpz_t mt;   /* All-purpose temp mpz_t */
  mpz_t product;
  mpz_t *product_ptr = NULL;
  int youpi = ECM_NO_FACTOR_FOUND;
  long timetotalstart, realtotalstart, timestart, realstart;

  timetotalstart = cputime ();
  realtotalstart = realtime ();

  ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
  ASSERT_ALWAYS (params->s_1 < params->l);
  nr = params->l - params->s_1; /* Number of points we evaluate */

  if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
      return ECM_ERROR;

  mpz_init (mt);

  /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution
     with g. We need NTT of transform length l here. If we want to add
     transformed vectors, we need to double the modulus. */

  if (twopass)
    mpz_set (mt, modulus->orig_modulus);
  else
    mpz_mul_2exp (mt, modulus->orig_modulus, 1UL);

  ntt_context = mpzspm_init (params->l, mt);

  if (ntt_context == NULL)
    {
      outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
               "presumably out of memory\n");
      mpz_clear (mt);
      free (S_1);
      S_1 = NULL;
      free (S_2);
      S_2 = NULL;
      return ECM_ERROR;
    }

  print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ",
		    ntt_context);

  /* Allocate memory for F with correct amount of space for each mpz_t */
  lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
				     the leading 1 monomial for each factor */
  {
    /* in some cases the above value of lenF is not enough, for example with
       s_1 = 10, which gives lenF = 7, but 9 entries are needed */
    unsigned long mem = mem_poly_from_sets_V (S_1);
    if (mem > lenF)
      lenF = mem;
  }

  F = init_list2 (lenF, (unsigned int) abs (modulus->bits) + GMP_NUMB_BITS);

  /* Build F */
  if (build_F_ntt (F, X, S_1, params, modulus) == ECM_ERROR)
    {
      free (S_1);
      free (S_2);
      mpz_clear (mt);
      mpzspm_clear (ntt_context);
      clear_list (F, lenF);
      return ECM_ERROR;
    }

  free (S_1);
  S_1 = NULL;

  mpres_init (b1_x, modulus);
  mpres_init (b1_y, modulus);
  mpres_init (Delta, modulus);

  /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
  mpres_sqr (Delta, X, modulus);
  mpres_sub_ui (Delta, Delta, 4UL, modulus);
  mpres_div_2exp (b1_x, X, 1, modulus);
  mpres_set_ui (b1_y, 1UL, modulus);
  mpres_div_2exp (b1_y, b1_y, 1, modulus);
  if (test_verbose (OUTPUT_TRACE))
    {
      mpres_get_z (mt, Delta, modulus);
      outputf (OUTPUT_TRACE,
	       "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
      gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
      outputf (OUTPUT_TRACE, "; /* PARI */\n");
      outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
    }

  /* Allocate remaining memory for h_ntt */
  h_x_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
  h_y_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
  /* Compute the h_j sequence */
  pp1_sequence_h (NULL, NULL, h_x_ntt, h_y_ntt, F, b1_x, b1_y, 0L,
		  params->s_1 / 2 + 1, params->P, Delta, modulus,
		  ntt_context);
  /* We don't need F(x) any more */
  clear_list (F, lenF);

  /* compute the forward transform of h and store the distinct coefficients
     in h_ntt */
  g_x_ntt = mpzspv_init (params->l, ntt_context);
  if (twopass)
    {
      g_y_ntt = g_x_ntt;
      R = init_list2 (nr, (mpz_size (modulus->orig_modulus) + 2) *
                          GMP_NUMB_BITS);
    }
  else
    g_y_ntt = mpzspv_init (params->l, ntt_context);

  /* Compute DCT-I of h_x and h_y */
  outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_x");
#ifdef _OPENMP
  outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
#endif
  timestart = cputime ();
  realstart = realtime ();
  mpzspv_to_dct1 (h_x_ntt, h_x_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
		  g_x_ntt, ntt_context);
  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

  outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_y");
#ifdef _OPENMP
  outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
#endif
  timestart = cputime ();
  realstart = realtime ();
  mpzspv_to_dct1 (h_y_ntt, h_y_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
		  g_x_ntt, ntt_context);
  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

  if (test_verbose (OUTPUT_RESVERBOSE))
    {
      mpz_init (product);
      product_ptr = &product;
    }

  for (l = 0; l < params->s_2; l++)
    {
      const long M = params->l - 1 - params->s_1 / 2;

      outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
               l + 1, params->s_2);
      if (twopass)
	{
	  /* Two-pass variant. Two separate convolutions,
	     then addition in Z/NZ */
	  pp1_sequence_g (NULL, NULL, g_x_ntt, NULL, b1_x, b1_y, params->P,
			  Delta, M, params->l, params->m_1, S_2->elem[l],
			  modulus, ntt_context);

	  /* Do the convolution product of g_x * h_x */
	  outputf (OUTPUT_VERBOSE, "Computing g_x*h_x");
#ifdef _OPENMP
          outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
#endif
	  timestart = cputime ();
	  realstart = realtime ();
	  mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context,
	    NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
	  /* Store the product coefficients we want in R */
	  mpzspv_to_mpzv (g_x_ntt, params->s_1 / 2, R, nr, ntt_context);
	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

	  /* Compute g_y sequence */
	  pp1_sequence_g (NULL, NULL, NULL, g_y_ntt, b1_x, b1_y, params->P,
			  Delta, M, params->l, params->m_1, S_2->elem[l],
			  modulus, ntt_context);

	  /* Do the convolution product of g_y * (Delta * h_y) */
	  outputf (OUTPUT_VERBOSE, "Computing g_y*h_y");
#ifdef _OPENMP
          outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
#endif
	  timestart = cputime ();
	  realstart = realtime ();
	  mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context,
	    NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

	  /* Compute product of sum of coefficients and gcd with N */
	  ntt_gcd (mt, product_ptr, g_y_ntt, params->s_1 / 2, R, nr,
		   ntt_context, modulus);
	}
      else
	{
	  /* One-pass variant. Two forward transforms and point-wise products,
	     then addition and single inverse transform */
	  pp1_sequence_g (NULL, NULL, g_x_ntt, g_y_ntt, b1_x, b1_y, params->P,
			  Delta, M, params->l, params->m_1, S_2->elem[l],
			  modulus, ntt_context);

	  outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_x");
#ifdef _OPENMP
          outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
#endif
	  timestart = cputime ();
	  realstart = realtime ();
	  mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context,
	    NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

	  outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_y");
#ifdef _OPENMP
          outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
#endif
	  timestart = cputime ();
	  realstart = realtime ();
	  mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context,
	    NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

	  outputf (OUTPUT_VERBOSE, "Adding and computing inverse NTT of sum");
#ifdef _OPENMP
          outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
#endif
	  timestart = cputime ();
	  realstart = realtime ();
	  mpzspv_add (g_x_ntt, (spv_size_t) 0, g_x_ntt, (spv_size_t) 0,
	              g_y_ntt, (spv_size_t) 0, params->l, ntt_context);
	  mpzspv_mul_by_dct (g_x_ntt, NULL, params->l, ntt_context,
	    NTT_MUL_STEP_IFFT);
	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);

	  ntt_gcd (mt, product_ptr, g_x_ntt, params->s_1 / 2, NULL, nr,
		   ntt_context, modulus);
	}

      outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);

      if (mpz_cmp_ui (mt, 1UL) > 0)
	{
	  mpz_set (f, mt);
	  youpi = ECM_FACTOR_FOUND_STEP2;
	  break;
	}
    }

  if (test_verbose (OUTPUT_RESVERBOSE))
    {
      product_ptr = NULL;
      mpz_clear (product);
    }
  mpzspv_clear (g_x_ntt, ntt_context);
  if (twopass)
    clear_list (R, nr);
  else
    mpzspv_clear (g_y_ntt, ntt_context);
  mpzspv_clear (h_x_ntt, ntt_context);
  mpzspv_clear (h_y_ntt, ntt_context);
  mpzspm_clear (ntt_context);
  mpz_clear (mt);
  mpres_clear (b1_x, modulus);
  mpres_clear (b1_y, modulus);
  mpres_clear (Delta, modulus);
  free (S_2);

  outputf (OUTPUT_NORMAL, "Step 2");
  /* In normal output mode, print only cpu time as we always have.
     In verbose mode, print real time as well if we used multi-threading */
  if (test_verbose (OUTPUT_VERBOSE))
    print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
  else
    print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);

  return youpi;
}