1 /* Implementation of fast stage 2 for P-1 and P+1 as described in
2    "Improved Stage 2 to $P\pm{}1$ Factoring Algorithms" by
3    Peter L. Montgomery and Alexander Kruppa, ANTS 2008 (8th Algorithmic
4    Number Theory Symposium).
5 
6 Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2016
7 Alexander Kruppa, Paul Zimmermann, David Cleaver
8 NTT functions are based on code Copyright 2005 Dave Newman.
9 
10 This file is part of the ECM Library.
11 
12 The ECM Library is free software; you can redistribute it and/or modify
13 it under the terms of the GNU Lesser General Public License as published by
14 the Free Software Foundation; either version 3 of the License, or (at your
15 option) any later version.
16 
17 The ECM Library is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
19 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
20 License for more details.
21 
22 You should have received a copy of the GNU Lesser General Public License
23 along with the ECM Library; see the file COPYING.LIB.  If not, see
24 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
25 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
26 
27 #include "config.h"
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <limits.h>
31 #include "ecm-impl.h"
32 #include "sp.h"
33 #include <math.h>
34 #ifdef HAVE_ALLOCA_H
35 #include <alloca.h>
36 #endif
37 #ifdef HAVE_STRING_H
38 #include <string.h>
39 #endif
40 #ifdef _OPENMP
41 #include <omp.h>
42 #endif
43 
44 /* TODO:
45    - move functions into their proper files (i.e. NTT functions etc.)
46    - later: allow storing NTT vectors on disk
47 */
48 
49 /* Define TEST_ZERO_RESULT to test if any result of the multipoint
50    evaluation is equal to zero. If the modulus is composite, this
51    happening might indicate a problem in the evalutaion code */
52 #define TEST_ZERO_RESULT
53 
54 const int pari = 0;
55 
56 const unsigned long Pvalues[] = {
57     3UL, 5UL, 9UL, 15UL, 21UL, 17UL, 27UL, 33UL, 45UL, 51UL, 63UL, 75UL,
58     105UL, 99UL, 135UL, 165UL, 195UL, 189UL, 231UL, 255UL, 315UL, 345UL,
59     357UL, 375UL, 405UL, 435UL, 525UL, 585UL, 615UL, 735UL, 765UL, 825UL,
60     945UL, 1155UL, 1065UL, 1365UL, 1305UL, 1335UL, 1575UL, 1785UL, 1995UL,
61     2145UL, 2205UL, 2415UL, 2625UL, 2805UL, 3045UL, 3465UL, 3675UL, 4095UL,
62     4305UL, 4515UL, 4725UL, 4785UL, 5355UL, 5775UL, 5985UL, 5865UL, 6825UL,
63     7245UL, 8085UL, 8925UL, 9555UL, 10395UL, 10725UL, 11025UL, 12285UL,
64     12705UL, 15015UL, 14175UL, 15225UL, 16065UL, 17325UL, 19635UL, 21945UL,
65     23205UL, 24255UL, 25935UL, 26775UL, 28875UL, 31395UL, 33495UL, 35805UL,
66     36465UL, 38115UL, 39585UL, 40425UL, 45045UL, 45885UL, 49665UL, 51765UL,
67     58905UL, 65835UL, 69615UL, 75075UL, 77805UL, 82005UL, 84315UL, 86625UL,
68     88935UL, 94185UL, 98175UL, 105105UL, 109725UL, 116025UL, 118755UL,
69     121275UL, 135135UL, 137445UL, 137655UL, 144375UL, 153615UL, 165165UL,
70     167475UL, 176715UL, 179025UL, 185955UL, 197505UL, 208845UL, 215985UL,
71     225225UL, 255255UL, 250635UL, 285285UL, 277095UL, 294525UL, 315315UL,
72     345345UL, 373065UL, 368445UL, 405405UL, 435435UL, 451605UL, 465465UL,
73     454545UL, 504735UL, 525525UL, 555555UL, 569415UL, 596505UL, 645645UL,
74     647955UL, 672945UL, 687225UL, 765765UL, 770385UL, 805035UL, 855855UL,
75     858585UL, 915915UL, 945945UL, 962115UL, 1036035UL, 1066065UL, 1119195UL,
76     1156155UL, 1276275UL, 1306305UL, 1354815UL, 1426425UL, 1456455UL,
77     1514205UL, 1576575UL, 1666665UL, 1726725UL, 1786785UL, 1789515UL,
78     1865325UL, 1996995UL, 1983135UL, 2177175UL, 2297295UL, 2327325UL,
79     2417415UL, 2567565UL, 2611455UL, 2807805UL, 2847075UL, 2878785UL,
80     3048045UL, 3161235UL, 3258255UL, 3357585UL, 3401475UL, 3533145UL,
81     3828825UL, 3918915UL, 3985905UL, 4279275UL, 4849845UL, 4789785UL,
82     4967655UL, 5180175UL, 5360355UL, 5870865UL, 5990985UL, 6561555UL,
83     6531525UL, 6891885UL, 7402395UL, 7912905UL, 8273265UL, 8580495UL,
84     8843835UL, 9444435UL, 10015005UL, 10465455UL, 10705695UL, 10885875UL,
85     11696685UL, 12267255UL, 12507495UL, 12785955UL, 13498485UL, 14549535UL,
86     14849835UL, 15570555UL, 16111095UL, 16291275UL, 17612595UL, 18123105UL,
87     18633615UL, 19684665UL, 20255235UL, 20825805UL, 22207185UL, 22717695UL,
88     24249225UL, 24819795UL, 25741485UL, 26531505UL, 28333305UL, 29354325UL,
89     30045015UL, 31396365UL, 32807775UL, 33948915UL, 33528495UL, 34879845UL,
90     37011975UL, 37522485UL, 39564525UL, 41096055UL, 43648605UL, 44219175UL,
91     45930885UL, 47222175UL, 48333285UL, 50075025UL, 51816765UL, 52777725UL,
92     55390335UL, 55547415UL, 59053995UL, 60063465UL, 61906845UL, 64579515UL,
93     66621555UL, 67492425UL, 70105035UL, 73258185UL, 74939865UL, 77224455UL,
94     79594515UL, 81876795UL, 84999915UL, 88062975UL, 91005915UL, 94189095UL,
95     98423325UL, 101846745UL, 111546435UL, 111035925UL, 115120005UL,
96     121246125UL, 124098975UL, 130945815UL, 140645505UL, 150345195UL,
97     150225075UL, 155450295UL, 158333175UL, 170255085UL, 179444265UL,
98     190285095UL, 198843645UL, 203408205UL, 206831625UL, 217222005UL,
99     229474245UL, 240705465UL, 252447195UL, 254999745UL, 269023755UL,
100     282146865UL, 287672385UL, 294076965UL, 306110805UL, 318302985UL,
101     334639305UL, 344338995UL, 354038685UL, 363738375UL, 373438065UL,
102     387221835UL, 400254855UL, 421936515UL, 431636205UL, 451035585UL,
103     453888435UL, 470434965UL, 480134655UL, 510765255UL, 522506985UL,
104     557732175UL, 570855285UL, 596530935UL, 610224615UL, 627912285UL,
105     654729075UL, 703227525UL, 722116395UL, 751725975UL, 780825045UL,
106     790524735UL, 821665845UL, 851275425UL, 863017155UL, 909984075UL,
107     936020085UL, 984518535UL, 1017041025UL, 1052416365UL
108 #if (ULONG_MAX > 4294967295)
109    ,1086110025UL, 1110614505UL, 1147371225UL, 1191785595UL, 1213887675UL,
110     1265809545UL, 1282356075UL, 1331995665UL, 1391905515UL, 1450103655UL,
111     1479202725UL, 1547100555UL, 1555088535UL, 1673196525UL, 1712565855UL,
112     1767130365UL, 1830673845UL, 1883166285UL, 1954487535UL, 2001964965UL,
113     2119382265UL, 2187280095UL, 2255177925UL, 2342475135UL, 2390973585UL,
114     2421213795UL, 2555868315UL, 2672264595UL, 2788660875UL, 2856558705UL,
115     2953555605UL, 3050552505UL, 3234846615UL, 3457939485UL, 3516137625UL,
116     3681032355UL, 3758629875UL, 3904125225UL, 4127218095UL, 4360010655UL,
117     4573403835UL, 4796496705UL, 4844995155UL, 5019589575UL, 5203883685UL,
118     5262081825UL, 5465775315UL, 5766465705UL, 5898837945UL, 6164152995UL,
119     6358146795UL, 6411780375UL, 6804332535UL, 6980458485UL, 7172920755UL,
120     7473611145UL, 7716103395UL, 7968295335UL, 8182259085UL, 8342499165UL,
121     8812168365UL, 9023519505UL, 9704539845UL, 9927632715UL, 10373818455UL,
122     10439434005UL, 10820004195UL, 11043097065UL, 11489282805UL,
123     11877270405UL, 12381654285UL, 12604747155UL, 13080031965UL,
124     13274025765UL, 13642613985UL, 14389490115UL, 14583483915UL,
125     15058768725UL, 15611651055UL, 16174233075UL, 16397325945UL,
126     17289697425UL, 17735883165UL, 18143270145UL, 18381678315UL,
127     19074440385UL, 19559424885UL, 20636090475UL, 20941375455UL,
128     21800053275UL, 22643926305UL, 23148310185UL, 24205576395UL,
129     24546777255UL, 25544133615UL, 26389538175UL, 26863291455UL,
130     27813861075UL, 29113619535UL, 29494189725UL, 30520074585UL,
131     30684969315UL, 31790733975UL, 33575476935UL, 34467848415UL,
132     35202742575UL, 36427185795UL, 38037334335UL, 39240095895UL,
133     40365259935UL, 42053005995UL, 43168470345UL, 44953213305UL,
134     45845584785UL, 48522699225UL, 50307442185UL, 51869092275UL,
135     53653835235UL, 54546206715UL, 56680138515UL, 58784971245UL,
136     59386352025UL, 61908271425UL, 63431122755UL, 65700850215UL,
137     67931778915UL, 70162707615UL, 72616729185UL, 74120181135UL,
138     75740029365UL, 78417143805UL, 80871165375UL, 82840202445UL,
139     86448487125UL, 88466022645UL, 91133437395UL, 92918180355UL,
140     100280245065UL, 100726430805UL, 102811864155UL, 106749938295UL,
141     109000266375UL, 113219631525UL, 119689324755UL, 121027881975UL,
142     127943760945UL, 132628711215UL, 134859639915UL, 141775518885UL,
143     148691397855UL, 150922326555UL, 155607276825UL, 161320394235UL,
144     164977177365UL, 171446870595UL, 177470378085UL, 183270792705UL
145 #endif
146 };
147 
148 /* All the prime factors that can appear in eulerphi(P) */
149 const unsigned long phiPfactors[] = {2UL, 3UL, 5UL, 7UL, 11UL, 13UL,
150 				     17UL, 19UL};
151 
152 
153 /* Some useful PARI functions:
154    sumset(a,b) = {local(i, j, l); l = listcreate (length(a) * length(b)); for (i = 1, length(a), for (j = 1, length(b), listput(l, a[i] + b[j]))); listsort (l, 1); l}
155 
156    V(i,X) = { if (i==0, return(2)); if (i==1, return(X)); if(i%2 == 0, return (V (i/2, X)^2-2)); return (V ((i+1)/2, X) * V ((i-1)/2, X) - X)}
157 
158    U(i,X) = { if (i==0, return(0)); if (i==1, return(1)); if(i%2 == 0, return (U (i/2, X) * V(i/2,X))); return (V ((i+1)/2, X)  *U( (i-1)/2, X) + 1)}
159 */
160 
161 #ifndef _OPENMP
omp_get_num_threads()162 static int omp_get_num_threads () {return 1;}
omp_get_thread_num()163 static int omp_get_thread_num () {return 0;}
164 #endif
165 
166 static void
167 ntt_sqr_reciprocal (mpzv_t, const mpzv_t, mpzspv_t, const spv_size_t,
168 		    const mpzspm_t);
169 
170 static void
print_elapsed_time(int verbosity,long cpu_start,ATTRIBUTE_UNUSED long real_start)171 print_elapsed_time (int verbosity, long cpu_start,
172 		    ATTRIBUTE_UNUSED long real_start)
173 {
174 #ifdef _OPENMP
175   if (real_start != 0L)
176     {
177       outputf (verbosity, " took %lums (%lums real)\n",
178 	       elltime (cpu_start, cputime()),
179 	       elltime (real_start, realtime()));
180       return;
181     }
182 #endif
183   outputf (verbosity, " took %lums\n", elltime (cpu_start, cputime()));
184 }
185 
186 
187 static void
print_CRT_primes(const int verbosity,const char * prefix,const mpzspm_t ntt_context)188 print_CRT_primes (const int verbosity, const char *prefix,
189 		   const mpzspm_t ntt_context)
190 {
191   double modbits = 0.;
192   unsigned int i;
193 
194   if (test_verbose (verbosity))
195     {
196       outputf (verbosity, "%s%lu", prefix, ntt_context->spm[0]->sp);
197       modbits += log ((double) ntt_context->spm[0]->sp);
198       for (i = 1; i < ntt_context->sp_num; i++)
199 	{
200 	  outputf (verbosity, " * %lu", ntt_context->spm[i]->sp);
201 	  modbits += log ((double) ntt_context->spm[i]->sp);
202 	}
203       outputf (verbosity, ", has %d primes, %f bits\n",
204                ntt_context->sp_num, modbits / log (2.));
205     }
206 }
207 
208 /* Approximate amount of memory in bytes each coefficient in an NTT takes
209    so that NTT can do transforms up to length lmax with modulus, or
210    with 2*modulus if twice != 0 */
211 static size_t
ntt_coeff_mem(const unsigned long lmax,const mpz_t modulus,const int twice)212 ntt_coeff_mem (const unsigned long lmax, const mpz_t modulus, const int twice)
213 {
214   mpz_t t;
215   size_t n;
216 
217   mpz_init (t);
218   mpz_mul (t, modulus, modulus);
219   mpz_mul_ui (t, t, lmax);
220   if (twice)
221     mpz_mul_2exp (t, t, 1UL);
222   /* +4: +1 for rounding up, +3 for extra words due to ECRT */
223   n = (mpz_sizeinbase (t, 2) - 1) / SP_NUMB_BITS + 4;
224   mpz_clear (t);
225   return n * sizeof (sp_t);
226 }
227 
228 size_t
pm1fs2_memory_use(const unsigned long lmax,const mpz_t modulus,const int use_ntt)229 pm1fs2_memory_use (const unsigned long lmax, const mpz_t modulus,
230 		   const int use_ntt)
231 {
232   if (use_ntt)
233     {
234       /* We store lmax / 2 + 1 coefficients for the DCT-I of F and lmax
235 	 coefficients for G in NTT ready format. Each coefficient in
236 	 NTT-ready format occupies approx.
237 	 ceil(log(lmax*modulus^2)/log(bits per sp_t)) + 3 words. */
238 
239       size_t n;
240 
241       n = ntt_coeff_mem (lmax, modulus, 0) * (size_t) (3 * lmax / 2 + 1);
242       outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
243 	       "with lmax = %lu NTT is %lu bytes\n", lmax, n);
244       return n;
245     }
246   else
247     {
248       /* F stores s_1/2 residues,
249 	 h stores s_1 mpz_t structs (residues get cloned from F)
250 	 g stores lmax residues,
251 	 R stores lmax-s_1 residues,
252 	 and tmp stores 3*lmax+list_mul_mem (lmax / 2) residues.
253 	 Assume s_1 is close to lmax/2.
254 	 Then we have
255 	 lmax/4 + lmax/2 + lmax + lmax/2 + 3*lmax + list_mul_mem (lmax / 2)
256 	 = (5+1/4)*lmax + list_mul_mem (lmax / 2) residues, plus s_1 mpz_t.
257       */
258 
259       size_t n;
260 
261       n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
262       n *= 5 * lmax + lmax / 4 + list_mul_mem (lmax / 2);
263       n += lmax / 2 * sizeof (mpz_t);
264       /* Memory use due to temp space allocation in TMulKS appears to
265 	 approximately triple the estimated memory use. This is hard to
266 	 estimate precisely, so let's go with the fudge factor of 3 here */
267       n *= 3;
268       outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
269 	       "with lmax = %lu is %lu bytes\n", lmax, n);
270       return n;
271     }
272 }
273 
274 /* return the possible lmax for given memory use and modulus */
275 
276 unsigned long
pm1fs2_maxlen(const size_t memory,const mpz_t modulus,const int use_ntt)277 pm1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt)
278 {
279   if (use_ntt)
280     {
281       size_t n, lmax = 1;
282 
283       n = ntt_coeff_mem (lmax, modulus, 0);
284       lmax = 1UL << ceil_log2 (memory / n / 3);
285       return lmax;
286     }
287   else
288     {
289       size_t lmax, n;
290 
291       n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
292 
293       /* Guess an initial value of lmax for list_mul_mem (lmax / 2) */
294       /* memory = n * 25/4 * lmax + lmax / 2 * sizeof (mpz_t); */
295       /* Fudge factor of 3 for TMulKS as above */
296       lmax = memory / (3 * 25 * n / 4 + 3 * sizeof (mpz_t) / 2);
297       return lmax;
298     }
299 }
300 
301 size_t
pp1fs2_memory_use(const unsigned long lmax,const mpz_t modulus,const int use_ntt,const int twopass)302 pp1fs2_memory_use (const unsigned long lmax, const mpz_t modulus,
303 		   const int use_ntt, const int twopass)
304 {
305   size_t n, m;
306 
307   m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
308   if (use_ntt)
309     {
310       /* In one pass mode, we store h_x_ntt and h_y_ntt, each of length
311 	 lmax/2(+1), and g_x_ntt and g_y_ntt, each of length lmax, all in
312 	 NTT ready format. In two pass mode, we store h_x_ntt, h_y_ntt and
313 	 g_x_ntt as before, plus R which is lmax - s_1 mpz_t.
314 	 We assume s_1 ~= lmax/2.
315       */
316 
317       n = ntt_coeff_mem (lmax, modulus, !twopass);
318       if (twopass)
319 	return lmax * (2 * n + m / 2);
320       else
321 	return lmax * 3 * n;
322     }
323   else
324     {
325       /* We allocate:
326 	 F: s_1/2 coefficients
327 	 fh_x, fh_y: s_1/2 coefficients
328 	 h_x, h_y: s_1 mpz_t's (cloned from fh_x and fh_y)
329 	 g_x, g_y: lmax coefficients
330 	 R_x, R_y: lmax - s_1 coefficients
331 	 tmp: 3UL * lmax + list_mul_mem (lmax / 2)
332 	 Assuming s_1 ~ lmax/2, that's
333 	 lmax/2 + 2*lmax/4 + 2*lmax + 2*lmax/2 * 3*lmax +
334            list_mul_mem (lmax / 2) =
335 	 7 + list_mul_mem (lmax / 2) coefficients and lmax mpz_t.
336        */
337 
338       n = m * (7 * lmax + list_mul_mem (lmax / 2));
339       n += lmax * sizeof (mpz_t);
340       n = 5 * n / 2; /* A fudge factor again */
341       return n;
342     }
343 }
344 
345 unsigned long
pp1fs2_maxlen(const size_t memory,const mpz_t modulus,const int use_ntt,const int twopass)346 pp1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt,
347 	       const int twopass)
348 {
349   size_t n, m;
350 
351   m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
352   if (use_ntt)
353     {
354       n = ntt_coeff_mem (1, modulus, !twopass);
355       if (twopass)
356 	n = memory / (2 * n + m / 2);
357       else
358 	n = memory / (3 * n);
359       return 1UL << (ceil_log2 (n / 2)); /* Rounded down to power of 2 */
360     }
361   else
362     {
363       return memory / 5 / (m * 8 + sizeof (mpz_t)) * 2;
364     }
365 }
366 
367 
368 /* Test if for given P, nr, B2min and B2 we can choose an m_1 so that the
369    stage 2 interval [B2min, B2] is covered. The effective B2min and B2
370    are stored in effB2min and effB2 */
371 
372 static int
test_P(const mpz_t B2min,const mpz_t B2,mpz_t m_1,const unsigned long P,const unsigned long nr,mpz_t effB2min,mpz_t effB2)373 test_P (const mpz_t B2min, const mpz_t B2, mpz_t m_1, const unsigned long P,
374 	const unsigned long nr, mpz_t effB2min, mpz_t effB2)
375 {
376   mpz_t m;
377   /* We need B2min >= 2 * max(S_1 + S_2) + (2*m_1 - 1)*P + 1, or
378      B2min - 2 * max(S_1 + S_2) - 1 >= (2*m_1)*P - P, or
379      (B2min - 2*max(S_1 + S_2) + P - 1)/(2P) >= m_1
380      Choose m_1 accordingly */
381 
382   mpz_init (m);
383   sets_max (m, P);
384   mpz_mul_2exp (m, m, 1UL); /* m = 2*max(S_1 + S_2) */
385 
386   mpz_sub (m_1, B2min, m);
387   mpz_sub_ui (m_1, m_1, 1UL); /* m_1 = B2min - 2*max(S_1 + S_2) - 1 */
388   mpz_add_ui (m_1, m_1, P);
389   mpz_fdiv_q_2exp (m_1, m_1, 1UL);
390   mpz_fdiv_q_ui (m_1, m_1, P);    /* 2UL*P may overflow */
391 
392   /* Compute effB2min = 2 * max(S_1 + S_2) + (2*(m_1 - 1) + 1)*P + 1 */
393 
394   mpz_mul_2exp (effB2min, m_1, 1UL);
395   mpz_sub_ui (effB2min, effB2min, 1UL);
396   mpz_mul_ui (effB2min, effB2min, P);
397   mpz_add (effB2min, effB2min, m);
398   mpz_add_ui (effB2min, effB2min, 1UL);
399   ASSERT_ALWAYS (mpz_cmp (effB2min, B2min) <= 0);
400 
401   /* Compute the smallest value coprime to P at the high end of the stage 2
402      interval that will not be covered:
403      2*(min(S_1 + S_2)) + (2*(m_1 + nr) + 1)*P.
404      We assume min(S_1 + S_2) = -max(S_1 + S_2) */
405   mpz_add_ui (effB2, m_1, nr);
406   mpz_mul_2exp (effB2, effB2, 1UL);
407   mpz_add_ui (effB2, effB2, 1UL);
408   mpz_mul_ui (effB2, effB2, P);
409   mpz_sub (effB2, effB2, m);
410 
411   /* The effective B2 values is that value, minus 1 */
412   mpz_sub_ui (effB2, effB2, 1UL);
413 
414   mpz_clear (m);
415   return (mpz_cmp (B2, effB2) <= 0);
416 }
417 
418 
419 static void
factor_phiP(int * exponents,const unsigned long phiP)420 factor_phiP (int *exponents, const unsigned long phiP)
421 {
422     const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
423     unsigned long cofactor = phiP;
424     int i;
425 
426     ASSERT_ALWAYS (phiP > 0UL);
427 
428     for (i = 0; i < nrprimes; i++)
429 	for (exponents[i] = 0; cofactor % phiPfactors[i] == 0UL; exponents[i]++)
430 	    cofactor /= phiPfactors[i];
431 
432     ASSERT_ALWAYS (cofactor == 1UL);
433 }
434 
435 
436 static unsigned long
pow_ul(const unsigned long b,const unsigned int e)437 pow_ul (const unsigned long b, const unsigned int e)
438 {
439     unsigned long r = 1UL;
440     unsigned int i;
441 
442     for (i = 0; i < e; i++)
443 	r *= b;
444 
445     return r;
446 }
447 
448 static unsigned long
absdiff_ul(unsigned long a,unsigned long b)449 absdiff_ul (unsigned long a, unsigned long b)
450 {
451     return (a > b) ? a - b : b - a;
452 }
453 
454 /* Choose s_1 so that s_1 * s_2 = phiP, s_1 is positive and even,
455    s_2 >= min_s2 and s_2 is minimal and abs(s_1 - l) is minimal
456    under those conditions. If use_ntt == 1, we require s_1 < l.
457    Returns 0 if no such choice is possible */
458 
459 static unsigned long
choose_s_1(const unsigned long phiP,const unsigned long min_s2,const unsigned long l,const int use_ntt)460 choose_s_1 (const unsigned long phiP, const unsigned long min_s2,
461 	    const unsigned long l, const int use_ntt)
462 {
463   const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
464   /* Using [nrprimes] here makes the compiler complain about variable-sized
465      arrays */
466   int phiPexponents[sizeof (phiPfactors) / sizeof (unsigned long)],
467     exponents[sizeof (phiPfactors) / sizeof (unsigned long)];
468   unsigned long s_1 = 0UL, s_2 = 0UL, trys_1;
469   int i;
470 
471   ASSERT_ALWAYS (phiP > 0 && phiP % 2 == 0);
472 
473   /* We want only even s_1. We divide one 2 out of phiP here... */
474   factor_phiP (phiPexponents, phiP / 2);
475   for (i = 0; i < nrprimes; i++)
476       exponents[i] = 0;
477 
478   do {
479       trys_1 = 2; /* ... and add a 2 here */
480       for (i = 0; i < nrprimes; i++)
481 	  trys_1 *= pow_ul (phiPfactors[i], exponents[i]);
482 #if 0
483       printf ("choose_s_1: Trying trys_1 = %lu\n", trys_1);
484 #endif
485       /* See if it satisfies all the required conditions and is an
486 	 improvement over the previous choice */
487       if (phiP / trys_1 >= min_s2 &&
488 	  (s_2 == 0UL || phiP / trys_1 < s_2) &&
489 	  absdiff_ul (trys_1, l) < absdiff_ul (s_1, l) &&
490 	  (use_ntt == 0 || trys_1 < l))
491       {
492 #if 0
493 	  printf ("choose_s_1: New best s_1 for phiP = %lu, min_s2 = %lu, "
494 		  "l = %lu : %lu\n", phiP, min_s2, l, trys_1);
495 #endif
496 	  s_1 = trys_1;
497       }
498       for (i = 0; i < nrprimes; i++)
499       {
500 	  if (++(exponents[i]) <= phiPexponents[i])
501 	      break;
502 	  exponents[i] = 0;
503       }
504   } while (i < nrprimes);
505 
506   return s_1;
507 }
508 
509 
510 /* Approximate cost of stage 2. Cost with and without ntt are not
511    comparable. We have l > s_1 and s_1 * s_2 = eulerphi(P), hence
512    s_2*l > eulerphi(P) and so cost (s_2, l) > eulerphi(P) for all P */
513 static unsigned long
est_cost(const unsigned long s_2,const unsigned long l,const int use_ntt,const int method)514 est_cost (const unsigned long s_2, const unsigned long l, const int use_ntt,
515           const int method)
516 {
517   if (method == ECM_PM1)
518     {
519       /* The time for building f, h and DCT-I of h seems to be about
520          7/6 of the time of computing g, h*g and gcd with NTT, and
521          3/2 of the time of computing g, h*g and gcd without NTT */
522 
523       if (use_ntt)
524         return (7 * l) / 6 + s_2 * l;
525       else
526         return (3 * l) / 2 + s_2 * l;
527     }
528   else if (method == ECM_PP1)
529     {
530       /* Building f is the same, building h and its forward transform is
531          twice about as expensive as for P-1. Each multi-point evaluation
532          is twice as expensive as for P-1.
533          FIXME: The estimate for NTT assumes the "one-pass" variant, in
534          "two-pass" the multipoint evaluations are slower, so the optimum
535          shifts towards smaller s_2 some more */
536       if (use_ntt)
537         return (4 * l) / 5 + s_2 * l;
538       else
539         return (3 * l) / 4 + s_2 * l;
540     }
541   else
542     abort (); /* Invalid value for method */
543 }
544 
545 /* Choose P so that a stage 2 range from B2min to B2 can be covered with
546    multipoint evaluations, each using a convolution of length at most lmax.
547    The parameters for stage 2 are stored in finalparams, the final effective
548    B2min and B2 values in final_B2min and final_B2, respecively. Each of these
549    may be NULL, in which case the value is not stored. It is permissible
550    to let B2min and final_B2min, or B2 and final_B2 point at the same mpz_t. */
551 
552 long
choose_P(const mpz_t B2min,const mpz_t B2,const unsigned long lmax,const unsigned long min_s2,faststage2_param_t * finalparams,mpz_t final_B2min,mpz_t final_B2,const int use_ntt,const int method)553 choose_P (const mpz_t B2min, const mpz_t B2, const unsigned long lmax,
554 	  const unsigned long min_s2, faststage2_param_t *finalparams,
555 	  mpz_t final_B2min, mpz_t final_B2, const int use_ntt,
556 	  const int method)
557 {
558   /* Let S_1 + S_2 == (Z/PZ)* (mod P).
559 
560      Let F(x) = \prod_{k_1 \in S_1} (x - b_1^{2 k_1}).
561 
562      If we evaluate F(b_1^{2 k_2 + (2m + 1)P}) for all k_2 \in S_2 with
563      m_1 <= m < m_1+nr, we test all exponents 2 k_2 + (2m + 1)P - 2 k_1.
564      The largest value coprime to P at the low end of the stage 2 interval
565      *not* covered will be
566        2*max(S_2) + (2*(m_1-1) + 1)*P - 2*min(S_1).
567      The smallest value at the high end not covered will be
568        2*min(S_2) + (2*(m_1 + nr) + 1)*P - 2*max(S_1).
569      Assume S_1 and S_2 are symmetric around 0, so that max(S_1) = -min(S_1).
570      Then the largest ... is:
571        2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P
572      The smallest ... is:
573        -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P
574      The effective B2min = 2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P + 1
575      The effective B2max = -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P - 1
576 
577      Then the difference effB2max - effB2min =
578        -4*(max(S_1) + max(S_2)) + 2P*(nr + 1) - 2
579 
580      We obviously require B2max - B2min <= 2*nr*P
581      Since nr < lmax, B2max - B2min <= 2*lmax*P or
582      P >= ceil((B2max - B2min)/(2*lmax))
583 
584      Hence we are looking for an odd P with s_1 * s_2 = eulerphi(P) so that
585      s_1 ~= lmax / 2 and the whole stage 2 interval is covered. s_2 should
586      be small, as long as s_1 is small enough.
587   */
588 
589   mpz_t B2l, m_1, effB2min, tryeffB2, effB2, lmin;
590   /* The best parameters found so far, P == 0 means that no suitable P
591      has been found yet: */
592   unsigned long P = 0, s_1 = 0, s_2 = 0, l = 0, cost = 0;
593   unsigned int i;
594   const unsigned int Pvalues_len = sizeof (Pvalues) / sizeof (unsigned long);
595   int r;
596 
597   outputf (OUTPUT_TRACE,
598            "choose_P(B2min = %Zd, B2 = %Zd, lmax = %lu, min_s2 = %ld, "
599            "use_ntt = %d, method = %d\n",
600            B2min, B2, lmax, min_s2, use_ntt, method);
601 
602   if (mpz_cmp (B2, B2min) < 0)
603     return 0L;
604 
605   /* If we use the NTT, we allow only power-of-two transform lengths.
606      In that case, the code below assumes that lmax is a power of two.
607      If that is not the case, print error and return. */
608   if (use_ntt && (lmax & (lmax - 1UL)) != 0)
609     {
610       outputf (OUTPUT_ERROR,
611                "choose_P: Error, lmax = %lu is not a power of two\n", lmax);
612       return ECM_ERROR;
613     }
614 
615   mpz_init (effB2);
616   mpz_init (tryeffB2);
617   mpz_init (effB2min);
618   mpz_init (B2l);
619   mpz_init (m_1);
620   mpz_init (lmin);
621 
622   mpz_sub (B2l, B2, B2min);
623   mpz_add_ui (B2l, B2l, 1UL); /* +1 due to closed interval */
624 
625   /* For each candidate P, check if [B2min, B2] can be covered at all,
626      and if so, what the best parameters (minimizing the cost, maximizing
627      effB2) are. If they are better than the best parameters for the best P
628      so far, remember them. */
629 
630   for (i = 0 ; i < Pvalues_len; i++)
631     {
632       unsigned long tryP, tryphiP, trys_1, trys_2, tryl, trycost;
633 
634       tryP = Pvalues[i];
635       tryphiP = eulerphi (tryP);
636 
637       outputf (OUTPUT_TRACE,
638 	       "choose_P: trying P = %lu, eulerphi(P) = %lu\n", tryP, tryphiP);
639 
640       /* If we have a good P already and this tryphiP >= cost, then
641 	 there's no hope for this tryP, since cost(s_2, l) > eulerphi(P) */
642       if (P != 0 && tryphiP >= cost)
643 	{
644 	  outputf (OUTPUT_TRACE,
645 		   "choose_P: tryphiP > cost = %lu, this P is too large\n",
646 		   cost);
647 	  continue;
648 	}
649 
650       /* We have nr < l and effB2-effB2min <= 2*nr*P. Hence we need
651 	 l >= B2l/P/2 */
652       mpz_cdiv_q_ui (lmin, B2l, tryP);
653       mpz_cdiv_q_2exp (lmin, lmin, 1UL);
654       outputf (OUTPUT_TRACE, "choose_P: lmin = %Zd for P = %lu\n", lmin, tryP);
655       if (mpz_cmp_ui (lmin, lmax) > 0)
656 	{
657 	  outputf (OUTPUT_TRACE,
658 		   "choose_P: lmin > lmax, this P is too small\n");
659 	  continue;
660 	}
661 
662       /* Try all possible transform lengths and store parameters in
663 	 P, s_1, s_2, l if they are better than the previously best ones */
664 
665       /* Keep reducing tryl to find best parameters. For NTT, we only have
666 	 power of 2 lengths so far, so we can simply divide by 2.
667 	 For non-NTT, we have arbitrary transform lengths so we can decrease
668 	 in smaller steps... let's say by, umm, 25% each time? */
669       for (tryl = lmax; mpz_cmp_ui (lmin, tryl) <= 0;
670 	   tryl = (use_ntt) ? tryl / 2 : 3 * tryl / 4)
671 	{
672 	  trys_1 = choose_s_1 (tryphiP, min_s2, tryl / 2, use_ntt);
673 	  if (trys_1 == 0)
674 	    {
675 	      outputf (OUTPUT_TRACE,
676 		       "choose_P: could not choose s_1 for P = %lu, l = %lu\n",
677 		       tryP, tryl);
678 	      continue;
679 	    }
680 	  ASSERT (tryphiP % trys_1 == 0UL);
681 	  trys_2 = tryphiP / trys_1;
682 	  outputf (OUTPUT_TRACE, "choose_P: chose s_1 = %lu, k = s_2 = %lu "
683 		   "for P = %lu, l = %lu\n", trys_1, trys_2, tryP, tryl);
684 
685 	  if (test_P (B2min, B2, m_1, tryP, tryl - trys_1, effB2min, tryeffB2))
686 	    {
687 	      outputf (OUTPUT_TRACE,
688 		       "choose_P: P = %lu, l = %lu, s_1 = %lu, k = s_2 = %lu "
689 		       "works, m_1 = %Zd, effB2min = %Zd, effB2 = %zZd\n",
690 		       tryP, tryl, trys_1, trys_2, m_1, effB2min, tryeffB2);
691 
692           /* do not allow B2min to be less than 0 */
693           if (mpz_cmp_ui(effB2min, 0) < 0)
694             continue;
695 
696 	      /* We use these parameters if we
697 		 1. didn't have any suitable ones yet, or
698 		 2. these cover [B2min, B2] and are cheaper than the best
699                     ones so far, or
700 		 3. they are as expensive but reach greater effB2. */
701 	      trycost = est_cost (trys_2, tryl, use_ntt, method);
702 	      ASSERT (tryphiP < trycost);
703 	      if (P == 0 || trycost < cost ||
704 		  (trycost == cost && mpz_cmp (tryeffB2, effB2) > 0))
705 		{
706 		  outputf (OUTPUT_TRACE,
707 			   "choose_P: and is the new optimum (cost = %lu)\n",
708 			   trycost);
709 		  P = tryP;
710 		  s_1 = trys_1;
711 		  s_2 = trys_2;
712 		  l = tryl;
713 		  cost = trycost;
714 		  mpz_set (effB2, tryeffB2);
715 		}
716 	    }
717 	}
718   }
719 
720   if (P != 0) /* If we found a suitable P */
721     {
722       /* Compute m_1, effB2min, effB2 again */
723       r = test_P (B2min, B2, m_1, P, l - s_1, effB2min, effB2);
724       ASSERT_ALWAYS(r != 0);
725       if (finalparams != NULL)
726 	{
727 	  finalparams->P = P;
728 	  finalparams->s_1 = s_1;
729 	  finalparams->s_2 = s_2;
730 	  finalparams->l = l;
731 	  mpz_set (finalparams->m_1, m_1);
732 	}
733       if (final_B2min != NULL)
734 	mpz_set (final_B2min, effB2min);
735       if (final_B2 != NULL)
736 	mpz_set (final_B2, effB2);
737     }
738 
739   mpz_clear (effB2);
740   mpz_clear (tryeffB2);
741   mpz_clear (effB2min);
742   mpz_clear (B2l);
743   mpz_clear (m_1);
744   mpz_clear (lmin);
745 
746   return (P != 0) ? (long) P : ECM_ERROR;
747 }
748 
749 
750 
751 static void
list_output_poly(listz_t l,unsigned long len,int monic,int symmetric,char * prefix,char * suffix,int verbosity)752 list_output_poly (listz_t l, unsigned long len, int monic, int symmetric,
753 		  char *prefix, char *suffix, int verbosity)
754 {
755   unsigned long i;
756 
757   if (prefix != NULL)
758     outputf (verbosity, prefix);
759 
760   if (len == 0)
761     {
762       if (monic)
763 	outputf (verbosity, "1\n", len, len);
764       else
765 	outputf (verbosity, "0\n", len);
766       return;
767     }
768 
769   if (monic)
770     {
771       if (symmetric)
772 	outputf (verbosity, "(x^%lu + x^-%lu) + ", len, len);
773       else
774 	outputf (verbosity, "x^%lu + ", len);
775     }
776   for (i = len - 1; i > 0; i--)
777     if (symmetric)
778       outputf (verbosity, "%Zd * (x^%lu + x^-%lu) + ", l[i], i, i);
779     else
780       outputf (verbosity, "%Zd * x^%lu + ", l[i], i);
781   outputf (verbosity, "%Zd", l[0]);
782   if (suffix != NULL)
783     outputf (verbosity, suffix);
784 }
785 
786 #if 0
787 /* Multiply P[i] by r^{k(deg-i)}, for 0 <= i <= deg. Needs 3 entries in tmp. */
788 /* I.e., let P(x) = x^deg + \sum_{i=0}^{deg - 1} P[i] * x^i. The output is
789    R(x) = x^deg + \sum_{i=0}^{deg - 1} R[i] * x^i = r^(k deg) P(r^{-k} x). */
790 /* The input and output polynomials are monic and have the leading monomial
791    implicit, i.e. not actually stored in the array of coefficients. */
792 /* Returns 0 if a modular inversion failed (in which case R is left
793    unchanged), 1 otherwise */
794 
795 static int
796 list_scale_rev (listz_t R, listz_t S, mpz_t r, long k, unsigned long deg,
797 		mpz_t modulus, listz_t tmp,
798 		ATTRIBUTE_UNUSED const unsigned long tmplen)
799 {
800   unsigned long i;
801 
802   ASSERT (tmplen >= 3);
803   mpz_powm_ui (tmp[0], r, (unsigned long) labs (k), modulus);
804   if (k < 0)
805     {
806       if (!mpz_invert (tmp[0], tmp[0], modulus)) /* FIXME: get rid of this! */
807 	return 0;
808     }
809   /* Here, tmp[0] = r^k */
810   mpz_set (tmp[1], tmp[0]);
811   /* mpz_set (R[deg], S[deg]); Leading monomial is not stored! */
812   for (i = 1; i + 1 <= deg; i++)
813     {
814       /* Here, tmp[1] = r^(ki) */
815       mpz_mul (tmp[2], S[deg-i], tmp[1]);
816       mpz_mod (R[deg-i], tmp[2], modulus);
817       mpz_mul (tmp[2], tmp[1], tmp[0]);  /* FIXME, avoid unnecessary mul */
818       mpz_mod (tmp[1], tmp[2], modulus); /* at end of loop */
819     }
820   if (i <= deg)
821     {
822       mpz_mul (tmp[2], S[deg-i], tmp[1]);
823       mpz_mod (R[deg-i], tmp[2], modulus);
824     }
825 
826   return 1;
827 }
828 #endif
829 
830 /* Same, but does squaring which makes things easier */
831 
832 static void
list_sqr_reciprocal(listz_t R,listz_t S,const unsigned long l,mpz_t modulus,listz_t tmp,ATTRIBUTE_UNUSED const unsigned long tmplen)833 list_sqr_reciprocal (listz_t R, listz_t S, const unsigned long l,
834 		     mpz_t modulus, listz_t tmp,
835 		     ATTRIBUTE_UNUSED const unsigned long tmplen)
836 {
837   unsigned long i;
838   listz_t Srev, r1 = tmp, r2 = tmp + 2 * l - 1, t = tmp + 4 * l - 2;
839 
840   if (l == 0UL)
841     return;
842 
843   /* FIXME: This modifies the input arguments. */
844   /* We have to divide S[0] by 2 */
845 
846   ASSERT (tmplen >= 4 * l - 2 + list_mul_mem (l));
847 
848 #if 0
849   gmp_printf ("/* list_sqr_reciprocal */ S(x) = %Zd", S[0]);
850   for (i = 1; i < l1; i++)
851     gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S[i], i, i);
852   gmp_printf ("\n");
853 #endif
854 
855   if (mpz_odd_p (S[0]))
856     {
857       ASSERT_ALWAYS (mpz_odd_p (modulus));
858       mpz_add (S[0], S[0], modulus);
859     }
860   mpz_tdiv_q_2exp (S[0], S[0], 1UL);
861 
862   list_mul (r1, S, l, S, l, 0, t);
863   /* r1 = f0*g0/4 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
864 #if 0
865   for (i = 0; i < 2UL * l - 1UL; i++)
866     gmp_printf ("list_sqr_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
867 #endif
868 
869   Srev = (listz_t) malloc (l * sizeof (mpz_t));
870   ASSERT_ALWAYS (Srev != NULL);
871   for (i = 0UL; i < l; i++)
872       (*Srev)[i] = (*S)[l - 1UL - i];
873   list_mul (r2, S, l, Srev, l, 0, t);
874   /* r2 is symmetric, r2[i] = r2[2*l - 2 - i]. Check this */
875 #if 0
876   for (i = 0; 0 && i < 2UL * l - 1UL; i++)
877     gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
878 #endif
879 #ifdef WANT_ASSERT
880   for (i = 0UL; i < l; i++)
881     ASSERT (mpz_cmp (r2[i], r2[2UL * l - 2UL - i]) == 0);
882 #endif
883   free (Srev);
884   /* r2 = g1*f0/2 + (g0*f0/4 + g1*f1) * x + g0*f1/2 * x^2 */
885 #if 0
886   for (i = 0; i < 2UL * l - 1UL; i++)
887     gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
888 #endif
889 
890   mpz_mul_2exp (r1[0], r1[0], 1UL);
891   /* r1 = f0*g0/2 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
892   for (i = 0UL; i < l; i++)
893     {
894       mpz_mul_2exp (r2[l - i - 1UL], r2[l - i - 1UL], 1UL);
895       mpz_add (R[i], r1[i], r2[l - i - 1UL]);
896     }
897   /* r1 = 3/4*f0*g0 + g1*f1 + (f0*g1 + 2*f1*g0)/2 * x + f1*g1 * x^2 */
898   /* r1 = f0*g0 + 2*g1*f1 + (f0*g1 + f1*g0) * x + f1*g1 * x^2 */
899   for (i = l; i < 2UL * l - 1UL; i++)
900       mpz_set (R[i], r1[i]);
901 
902   if (R != S)
903     mpz_mul_2exp (S[0], S[0], 1UL);
904 
905 #if 0
906   for (i = 0; i < 2UL * l; i++)
907     gmp_printf ("list_sqr_reciprocal: R[%lu] = %Zd\n", i, R[i]);
908 #endif
909 }
910 
911 #ifdef WANT_ASSERT
912 static void
list_recip_eval1(mpz_t R,const listz_t S,const unsigned long l)913 list_recip_eval1 (mpz_t R, const listz_t S, const unsigned long l)
914 {
915   unsigned long i;
916 
917   mpz_set_ui (R, 0UL);
918   for (i = 1; i < l; i++)
919     mpz_add (R, R, S[i]);
920   mpz_mul_2exp (R, R, 1UL);
921   if (l > 0UL)
922     mpz_add (R, R, S[0]);
923 }
924 #endif
925 
926 /* Multiply two reciprocal polynomials of degree 2*l1-2 and 2*l2-2, resp.,
927    with coefficients in standard basis
928 
929    S_1(x) = S1[0] + sum_{1 \leq i \leq l1 - 1} S1[i] (x^i + x^{-i})
930    S_2(x) = S2[0] + sum_{1 \leq i \leq l2 - 1} S2[i] (x^i + x^{-i})
931 
932    to the reciprocal polynomial of degree 2*(l1 + l2) - 4
933 
934    R(x) = R[0] + sum_{1 \leq i \leq l1 + l2 - 2} R[i] (x^i + x^{-i})
935         = S_1(x) * S_2(x)
936 
937    R == S1 == S2 is permissible, however if S1 == S2, l1 must be equal
938    to l2 (i.e. the multiplication must be a squaring)
939 */
940   /* FIXME: This modifies the input arguments. */
941   /* We have to divide S1[0] and S2[0] by 2 */
942 
943 static void
list_mul_reciprocal(listz_t R,listz_t S1,unsigned long l1,listz_t S2,unsigned long l2,mpz_t modulus,listz_t tmp,ATTRIBUTE_UNUSED const unsigned long tmplen)944 list_mul_reciprocal (listz_t R, listz_t S1, unsigned long l1,
945 		     listz_t S2, unsigned long l2,
946 		     mpz_t modulus, listz_t tmp,
947 		     ATTRIBUTE_UNUSED const unsigned long tmplen)
948 {
949   unsigned long i;
950   const unsigned long lmax = MAX(l1, l2);
951   listz_t r1 = tmp, r2 = tmp + 2*lmax - 1, rev = tmp + 4*lmax - 2,
952     t = tmp + 6*lmax - 3;
953 #ifdef WANT_ASSERT
954   mpz_t sum1, sum2, prod;
955 #endif
956 
957   ASSERT (S1 < tmp || S1 >= tmp + tmplen);
958   ASSERT (S2 < tmp || S2 >= tmp + tmplen);
959   ASSERT (R < tmp || R >= tmp + tmplen);
960 
961   if (l1 == 0UL || l2 == 0UL)
962     return;
963 
964   if (S1 == S2)
965     {
966       ASSERT_ALWAYS (l1 == l2);
967       list_sqr_reciprocal (R, S1, l1, modulus, tmp, tmplen);
968       return;
969     }
970 
971   ASSERT (tmplen >= 6*lmax - 3 + list_mul_mem (lmax));
972 #ifdef WANT_ASSERT
973   mpz_init (sum1);
974   mpz_init (sum2);
975   mpz_init (prod);
976   list_recip_eval1 (sum1, S1, l1);
977   list_recip_eval1 (sum2, S2, l2);
978   mpz_mul (prod, sum1, sum2);
979   mpz_mod (prod, prod, modulus);
980 #endif
981 
982 
983   /* Make S1 the longer of the two, i.e. l1 >= l2 */
984   if (l2 > l1)
985     {
986       listz_t St = S1;
987       unsigned long lt = l1;
988       S1 = S2;
989       S2 = St;
990       l1 = l2;
991       l2 = lt;
992     }
993 
994 #if 0
995   gmp_printf ("/* list_mul_reciprocal */ S1(x) = %Zd", S1[0]);
996   for (i = 1; i < l1; i++)
997     gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S1[i], i, i);
998   gmp_printf ("\n");
999   gmp_printf ("/* list_mul_reciprocal */ S2(x) = %Zd", S2[0]);
1000   for (i = 1; i < l1; i++)
1001     gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S2[i], i, i);
1002   gmp_printf ("\n");
1003 #endif
1004 
1005   /* Divide S1[0] and S2[0] by 2 */
1006   if (mpz_odd_p (S1[0]))
1007     {
1008       ASSERT_ALWAYS (mpz_odd_p (modulus));
1009       mpz_add (S1[0], S1[0], modulus);
1010     }
1011   mpz_tdiv_q_2exp (S1[0], S1[0], 1UL);
1012 
1013   if (mpz_odd_p (S2[0]))
1014     {
1015       ASSERT_ALWAYS (mpz_odd_p (modulus));
1016       mpz_add (S2[0], S2[0], modulus);
1017     }
1018   mpz_tdiv_q_2exp (S2[0], S2[0], 1UL);
1019 
1020   /* Pad rev with zeros */
1021   for (i = l2; i < lmax; i++)
1022     mpz_set_ui (rev[i], 0UL);
1023 
1024   for (i = 0UL; i < l2; i++)
1025     mpz_set (rev[i], S2[l2 - 1UL - i]);
1026   list_mul (r1, S1, lmax, rev, lmax, 0, t);
1027   /* r1 = \tilde{f}(x) \rev(\tilde{g}(x)) and has degree l1 + l2 - 2,
1028      i.e. l1 + l2 - 1 entries. */
1029 #if 0
1030   for (i = 0; i < 2 * lmax - 1; i++)
1031     gmp_printf ("list_mul_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
1032 #endif
1033 
1034   for (i = 0UL; i < l2; i++)
1035     mpz_set(rev[i], S2[i]);
1036   list_mul (r2, S1, lmax, rev, lmax, 0, t);
1037   /* \tilde{f}(x) \tilde{g}(x) */
1038 
1039 #if 0
1040   for (i = 0; i < 2 * lmax - 1; i++)
1041     gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
1042 #endif
1043 
1044   /* Add f_0*g_0 by doubling the f_0*g_0 term in r2 */
1045   mpz_mul_2exp (r2[0], r2[0], 1UL);
1046 
1047   /* Add \flloor x^{-d_g} \tilde{f}(x) \rev(\tilde{g}(x)) \rfloor.
1048      d_g = l2 - 1. */
1049   for (i = 0; i < l1; i++)
1050     mpz_add (r2[i], r2[i], r1[i + l2 - 1]);
1051 
1052   /* Add \floor x^{-d_f} rev(\tilde{f}(x) \rev(\tilde{g}(x))) \rfloor.
1053      d_f = l1 - 1. rev(r2)[i] = r2[l1 + l2 - 2 - i]. We want
1054      rev(r2)[l1 - 1 ... l1 + l2 - 2], hence
1055      r2[l2 - 1 ... 0] */
1056   for (i = 0; i < l2; i++)
1057     mpz_add (r2[i], r2[i], r1[l2 - 1 - i]);
1058 
1059 #if 0
1060   for (i = 0; i < l1 + l2 - 1; i++)
1061     gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
1062 #endif
1063 
1064   mpz_mul_2exp (S1[0], S1[0], 1UL);
1065   mpz_mul_2exp (S2[0], S2[0], 1UL);
1066 
1067   for (i = 0; i < l1 + l2 - 1; i++)
1068     mpz_set (R[i], r2[i]);
1069 
1070 #if 0
1071   for (i = 0; i < l1 + l2 - 1; i++)
1072     gmp_printf ("list_mul_reciprocal: R[%lu] = %Zd\n", i, R[i]);
1073 #endif
1074 #ifdef WANT_ASSERT
1075   list_recip_eval1 (sum1, R, l1 + l2 - 1);
1076   mpz_mod (sum1, sum1, modulus);
1077   ASSERT (mpz_cmp (prod, sum1) == 0);
1078   mpz_clear (sum1);
1079   mpz_clear (sum2);
1080   mpz_clear (prod);
1081 #endif
1082 }
1083 
1084 /*
1085   Computes V_k(S), where the Chebyshev polynomial V_k(X) is defined by
1086   V_k(X + 1/X) = X^k + 1/X^k
1087 */
1088 
1089 static void
V(mpres_t R,const mpres_t S,const long k,mpmod_t modulus)1090 V (mpres_t R, const mpres_t S, const long k, mpmod_t modulus)
1091 {
1092   mpres_t V0, Vi, Vi1;
1093   unsigned long j, uk;
1094   int po2;
1095 
1096   if (k == 0L)
1097     {
1098       mpres_set_ui (R, 2UL, modulus);
1099       return;
1100     }
1101 
1102   uk = labs (k);
1103 
1104   if (uk == 1UL)
1105     {
1106       mpres_set (R, S, modulus);
1107       return;
1108     }
1109 
1110   for (po2 = 0; uk % 2UL == 0UL; uk >>= 1, po2++);
1111 
1112   mpres_init (V0, modulus);
1113   mpres_set_ui (V0, 2UL, modulus); /* V0 = V_0(S) = 2 */
1114 
1115   if (uk == 1UL)
1116     {
1117       mpres_set (R, S, modulus);
1118       while (po2-- > 0)
1119         {
1120           mpres_sqr (R, R, modulus);
1121           mpres_sub (R, R, V0, modulus);
1122         }
1123       mpres_clear (V0, modulus);
1124       return;
1125     }
1126 
1127   if (0)
1128     {
1129       mpz_t tz;
1130       mpz_init (tz);
1131       mpres_get_z (tz, S, modulus);
1132       gmp_printf ("Chebyshev_V(%ld, Mod(%Zd,N)) == ", k, tz);
1133       mpz_clear (tz);
1134     }
1135 
1136   for (j = 1UL; j <= uk / 2UL; j <<= 1);
1137 
1138   mpres_init (Vi, modulus);
1139   mpres_init (Vi1, modulus);
1140 
1141   /* i = 1. Vi = V_i(S), Vi1 = V_{i+1}(S) */
1142   mpres_set (Vi, S, modulus);
1143   mpres_sqr (Vi1, S, modulus);
1144   mpres_sub (Vi1, Vi1, V0, modulus);
1145   j >>= 1;
1146 
1147   while (j > 1)
1148     {
1149       if ((uk & j) != 0UL)
1150 	{
1151 	  /* i' = 2i + 1.
1152 	     V_{i'} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1
1153 	     V_{i'+1} = V_{2i + 2} = {V_{i+1}}^2 - V_0. */
1154 	  mpres_mul (Vi, Vi, Vi1, modulus);
1155 	  mpres_sub (Vi, Vi, S, modulus);
1156 	  mpres_sqr (Vi1, Vi1, modulus);
1157 	  mpres_sub (Vi1, Vi1, V0, modulus);
1158 	}
1159       else
1160 	{
1161 	  /* i' = 2i.
1162 	     V_{i'} = V_{2i} = {V_i}^2 - V0.
1163 	     V_{i'+1} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1 */
1164 	  mpres_mul (Vi1, Vi, Vi1, modulus);
1165 	  mpres_sub (Vi1, Vi1, S, modulus);
1166 
1167 	  mpres_sqr (Vi, Vi, modulus);
1168 	  mpres_sub (Vi, Vi, V0, modulus);
1169 	}
1170       j >>= 1;
1171     }
1172 
1173   /* Least significant bit of uk is always 1 */
1174   mpres_mul (Vi, Vi, Vi1, modulus);
1175   mpres_sub (Vi, Vi, S, modulus);
1176 
1177   while (po2-- > 0)
1178     {
1179       mpres_sqr (Vi, Vi, modulus);
1180       mpres_sub (Vi, Vi, V0, modulus);
1181     }
1182 
1183   mpres_set (R, Vi, modulus);
1184 
1185   mpres_clear (Vi, modulus);
1186   mpres_clear (Vi1, modulus);
1187   mpres_clear (V0, modulus);
1188 
1189   if (0)
1190     {
1191       mpz_t tz;
1192       mpz_init (tz);
1193       mpres_get_z (tz, R, modulus);
1194       gmp_printf ("%Zd\n", tz);
1195       mpz_clear (tz);
1196     }
1197 }
1198 
1199 /*
1200   Computes U_k(S), where the Chebyshev polynomial U_k(X) is defined by
1201   U_k(X + 1/X) = (X^k - 1/X^k) / (X - 1/X)
1202   If R1 != NULL, stores U_{k+1}(S) there
1203 */
1204 
1205 static void
U(mpres_t R,mpres_t R1,const mpres_t S,const long k,mpmod_t modulus)1206 U (mpres_t R, mpres_t R1, const mpres_t S, const long k, mpmod_t modulus)
1207 {
1208   mpres_t V0, Vi, Vi1, Ui, Ui1, t;
1209   unsigned long j, uk;
1210 
1211   if (k == 0L)
1212     {
1213       mpres_set_ui (R, 0UL, modulus); /* U_0 = 0 */
1214       if (R1 != NULL)
1215 	mpres_set_ui (R1, 1UL, modulus); /* U_1 = 1 */
1216       return;
1217     }
1218 
1219   uk = labs (k);
1220 
1221   if (uk == 1UL)
1222     {
1223       mpres_set_ui (R, 1UL, modulus);
1224       if (k == -1)
1225 	mpres_neg (R, R, modulus);
1226 
1227       if (R1 != NULL)
1228 	{
1229 	  if (k == -1)
1230 	    mpres_set_ui (R1, 0UL, modulus);
1231 	  else
1232 	    mpres_set (R1, S, modulus); /* U_2(S) = S */
1233 	}
1234 
1235       return;
1236     }
1237 
1238   if (0)
1239     {
1240       mpz_t tz;
1241       mpz_init (tz);
1242       mpres_get_z (tz, S, modulus);
1243       gmp_printf ("Chebyshev_U(%ld, Mod(%Zd,N)) == ", k, tz);
1244       mpz_clear (tz);
1245     }
1246 
1247   mpres_init (V0, modulus);
1248   mpres_init (Vi, modulus);
1249   mpres_init (Vi1, modulus);
1250   mpres_init (Ui, modulus);
1251   mpres_init (Ui1, modulus);
1252   mpres_init (t, modulus);
1253 
1254   for (j = 1UL; j <= uk / 2UL; j <<= 1);
1255 
1256   mpres_set_ui (Ui, 1UL, modulus);   /* Ui = U_1(S) = 1 */
1257   mpres_set (Ui1, S, modulus);       /* Ui1 = U_2(S) = S */
1258   mpres_add (V0, Ui, Ui, modulus);   /* V0 = V_0(S) = 2 */
1259   mpres_set (Vi, S, modulus);        /* Vi = V_1(S) = S */
1260   mpres_sqr (Vi1, Vi, modulus);
1261   mpres_sub (Vi1, Vi1, V0, modulus); /* Vi1 = V_2(S) = S^2 - 2 */
1262   j >>= 1; /* i = 1 */
1263 
1264   while (j != 0)
1265     {
1266       if ((uk & j) == 0UL)
1267 	{
1268 	  mpres_mul (Vi1, Vi1, Vi, modulus);
1269 	  mpres_sub (Vi1, Vi1, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
1270 	  /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
1271 	  mpres_sub (t, Ui1, Ui, modulus);
1272 	  mpres_add (Ui1, Ui1, Ui, modulus);
1273 	  mpres_mul (Ui1, Ui1, t, modulus);
1274 	  mpres_mul (Ui, Ui, Vi, modulus); /* U_{2n} = U_n V_n */
1275 	  mpres_sqr (Vi, Vi, modulus);
1276 	  mpres_sub (Vi, Vi, V0, modulus); /* V_{2n} = V_n^2 - 2 */
1277 	}
1278       else
1279 	{
1280 	  /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
1281 	  mpres_sub (t, Ui1, Ui, modulus);
1282 	  mpres_add (Ui, Ui, Ui1, modulus);
1283 	  mpres_mul (Ui, Ui, t, modulus);
1284 	  mpres_mul (Ui1, Ui1, Vi1, modulus); /* U_{2n+2} = U_{n+1} V_{n+1} */
1285 	  mpres_mul (Vi, Vi, Vi1, modulus);
1286 	  mpres_sub (Vi, Vi, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
1287 	  mpres_sqr (Vi1, Vi1, modulus);
1288 	  mpres_sub (Vi1, Vi1, V0, modulus); /* V_{2n+2} = V_{n+1}^2 - 2 */
1289 	}
1290       j >>= 1;
1291     }
1292 
1293   if (k > 0)
1294     mpres_set (R, Ui, modulus);
1295   else
1296     mpres_neg (R, Ui, modulus);
1297 
1298   if (R1 != NULL)
1299     {
1300       /* Here k != -1,0,1, so k+1 is negative iff k is */
1301       if (k > 0)
1302 	mpres_set (R1, Ui1, modulus);
1303       else
1304 	mpres_neg (R1, Ui1, modulus);
1305     }
1306 
1307   mpres_clear (V0, modulus);
1308   mpres_clear (Vi, modulus);
1309   mpres_clear (Vi1, modulus);
1310   mpres_clear (Ui, modulus);
1311   mpres_clear (Ui1, modulus);
1312   mpres_clear (t, modulus);
1313 
1314   if (0)
1315     {
1316       mpz_t tz;
1317       mpz_init (tz);
1318       mpres_get_z (tz, R, modulus);
1319       gmp_printf ("%Zd\n", tz);
1320       mpz_clear (tz);
1321     }
1322 }
1323 
1324 
1325 /* Set R[i] = V_{i+k}(Q) * F[i] or U_{i+k}(Q) * F[i], for 0 <= i < len
1326    We compute V_{i+k+1}(Q) by V_{i+k}(Q)*V_1(Q) - V_{i+k-1}(Q).
1327    For U, we compute U_{i+k+1}(Q) by U_{i+k}(Q)*V_1(Q) - U_{i+k-1}(Q).
1328    The values of V_1(Q), V_{k-1}(Q) and V_k(Q) and V_k(Q) are in
1329    V1, Vk_1 and Vk, resp.
1330    The values of Vk_1 and Vk are clobbered. */
1331 static void
scale_by_chebyshev(listz_t R,const listz_t F,const unsigned long len,mpmod_t modulus,const mpres_t V1,mpres_t Vk_1,mpres_t Vk)1332 scale_by_chebyshev (listz_t R, const listz_t F, const unsigned long len,
1333                     mpmod_t modulus, const mpres_t V1, mpres_t Vk_1,
1334                     mpres_t Vk)
1335 {
1336   mpres_t Vt;
1337   unsigned long i;
1338 
1339   mpres_init (Vt, modulus);
1340 
1341   for (i = 0; i < len; i++)
1342     {
1343       mpres_mul_z_to_z (R[i], Vk, F[i], modulus);
1344       mpres_mul (Vt, Vk, V1, modulus);
1345       mpres_sub (Vt, Vt, Vk_1, modulus);
1346       mpres_set (Vk_1, Vk, modulus); /* Could be a swap */
1347       mpres_set (Vk, Vt, modulus); /* Could be a swap */
1348     }
1349 
1350   mpres_clear (Vt, modulus);
1351 }
1352 
1353 
1354 /* For a given reciprocal polynomial
1355    F(x) = f_0 + sum_{i=1}^{deg} f_i V_i(x+1/x),
1356    compute F(\gamma x)F(\gamma^{-1} x), with Q = \gamma + 1 / \gamma
1357 
1358    If NTT is used, needs 4 * deg + 3 entries in tmp.
1359    If no NTT is used, needs 4 * deg + 2 + (memory use of list_sqr_reciprocal)
1360 */
1361 
1362 static void
list_scale_V(listz_t R,const listz_t F,const mpres_t Q,const unsigned long deg,mpmod_t modulus,listz_t tmp,const unsigned long tmplen,mpzspv_t dct,const mpzspm_t ntt_context)1363 list_scale_V (listz_t R, const listz_t F, const mpres_t Q,
1364               const unsigned long deg, mpmod_t modulus, listz_t tmp,
1365               const unsigned long tmplen,
1366 	      mpzspv_t dct, const mpzspm_t ntt_context)
1367 {
1368   mpres_t Vt;
1369   unsigned long i;
1370   const listz_t G = tmp, H = tmp + 2 * deg + 1, newtmp = tmp + 4 * deg + 2;
1371   const unsigned long newtmplen = tmplen - 4 * deg - 2;
1372 #ifdef WANT_ASSERT
1373   mpz_t leading;
1374 #endif
1375 
1376   if (deg == 0)
1377     {
1378       ASSERT(tmplen >= 1);
1379       mpz_mul (tmp[0], F[0], F[0]);
1380       mpz_mod (R[0], tmp[0], modulus->orig_modulus);
1381       return;
1382     }
1383 
1384   /* Make sure newtmplen does not underflow */
1385   ASSERT_ALWAYS (tmplen >= 4 * deg + 2);
1386 #ifdef WANT_ASSERT
1387   mpz_init (leading);
1388   mpz_mul (leading, F[deg], F[deg]);
1389   mpz_mod (leading, leading, modulus->orig_modulus);
1390 #endif
1391 
1392   /* Generate V_1(Q)/2 ... V_{deg}(Q)/2, multiply by f_i to form coefficients
1393      of G(x). Square the symmetric G(x) polynomial. */
1394 
1395   outputf (OUTPUT_TRACE, "list_scale_V: Q=%Zd, deg = %lu\n", Q, deg);
1396   list_output_poly (F, deg + 1, 0, 1, "/* list_scale_V */ F(x) = ", "\n",
1397 		    OUTPUT_TRACE);
1398 
1399   /* Compute G[i] = V_i(Q)/2 * F[i] for i = 0, ..., deg.
1400      For i=0, V_0(Q) = 2, so G[0] = F[0],
1401      which leaves deg entries to process */
1402 
1403   mpz_set (G[0], F[0]);
1404 
1405 #if defined(_OPENMP)
1406 #pragma omp parallel if (deg > 1000)
1407 #endif
1408   {
1409     const int nr_chunks = omp_get_num_threads();
1410     const int thread_nr = omp_get_thread_num();
1411     mpmod_t modulus_local;
1412     unsigned long l, start_i;
1413     mpres_t Vi, Vi_1;
1414 
1415     l = (deg - 1) / nr_chunks + 1; /* l = ceil (deg / nr_chunks) */
1416     start_i = thread_nr * l + 1;
1417     if (start_i <= deg + 1)
1418       l = MIN(l, deg + 1 - start_i);
1419     else
1420       l = 0;
1421 
1422     mpmod_init_set (modulus_local, modulus);
1423     mpres_init (Vi_1, modulus_local);
1424     mpres_init (Vi, modulus_local);
1425 
1426     V (Vi, Q, start_i, modulus_local);
1427     mpres_div_2exp (Vi, Vi, 1, modulus_local);
1428     V (Vi_1, Q, start_i - 1UL, modulus_local);
1429     mpres_div_2exp (Vi_1, Vi_1, 1, modulus_local);
1430     scale_by_chebyshev (G + start_i, F + start_i, l, modulus_local,
1431                         Q, Vi_1, Vi);
1432 
1433     mpres_clear (Vi_1, modulus_local);
1434     mpres_clear (Vi, modulus_local);
1435     mpmod_clear (modulus_local);
1436   }
1437 
1438 
1439   list_output_poly (G, deg + 1, 0, 1, "/* list_scale_V */ G(x) = ", "\n",
1440 		    OUTPUT_TRACE);
1441 
1442   /* Now square the G polynomial in G[0 .. deg], put result in
1443      G[0 .. 2*deg] */
1444 
1445   /* Bugfix: ks_multiply() does not like negative coefficients. FIXME */
1446 
1447   for (i = 0; i <= deg; i++)
1448     if (mpz_sgn (G[i]) < 0)
1449       {
1450 	mpz_add (G[i], G[i], modulus->orig_modulus);
1451 	/* FIXME: make sure the absolute size does not "run away" */
1452 	if (mpz_sgn (G[i]) < 0)
1453 	  {
1454 	    outputf (OUTPUT_ERROR, "list_scale_V: G[%lu] still negative\n", i);
1455 	    mpz_mod (G[i], G[i], modulus->orig_modulus);
1456 	  }
1457       }
1458 
1459   if (dct != NULL && ntt_context != NULL)
1460     ntt_sqr_reciprocal (G, G, dct, deg + 1, ntt_context);
1461   else
1462     list_sqr_reciprocal (G, G, deg + 1, modulus->orig_modulus,
1463                          newtmp, newtmplen);
1464 
1465   list_output_poly (G, 2 * deg + 1, 0, 1, "/* list_scale_V */ G(x)^2 == ",
1466 		    "\n", OUTPUT_TRACE);
1467 
1468   /* Compute H[i-1] = U_i(Q)/2 * F[i] for i = 1, ..., deg */
1469 
1470 #if defined(_OPENMP)
1471 #pragma omp parallel if (deg > 1000)
1472 #endif
1473   {
1474     const int nr_chunks = omp_get_num_threads();
1475     const int thread_nr = omp_get_thread_num();
1476     mpmod_t modulus_local;
1477     unsigned long l, start_i;
1478     mpres_t Ui, Ui_1;
1479 
1480     l = (deg - 1) / nr_chunks + 1; /* l = ceil(deg / nr_chunks) */
1481     start_i = thread_nr * l + 1UL;
1482     if (start_i <= deg + 1)
1483       l = MIN(l, deg + 1 - start_i);
1484     else
1485       l = 0;
1486 
1487     mpmod_init_set (modulus_local, modulus);
1488     mpres_init (Ui_1, modulus_local);
1489     mpres_init (Ui, modulus_local);
1490 
1491     U (Ui_1, Ui, Q, start_i - 1, modulus_local);
1492     mpres_div_2exp (Ui, Ui, 1, modulus_local);
1493     mpres_div_2exp (Ui_1, Ui_1, 1, modulus_local);
1494 
1495     scale_by_chebyshev (H - 1 + start_i, F + start_i, l, modulus_local,
1496                         Q, Ui_1, Ui);
1497 
1498     mpres_clear (Ui_1, modulus_local);
1499     mpres_clear (Ui, modulus_local);
1500     mpmod_clear (modulus_local);
1501   }
1502 
1503 
1504   /* Convert H to standard basis */
1505   /* We can do it in-place with H - 1 = H_U. */
1506 
1507   for (i = deg; i >= 3; i--)
1508     {
1509       mpz_add (H[i - 3], H[i - 3], H[i - 1]);
1510       if (mpz_cmp (H[i - 3], modulus->orig_modulus) >= 0)
1511         mpz_sub (H[i - 3], H[i - 3], modulus->orig_modulus);
1512     }
1513 
1514   /* U_2(X+1/X) = (X^2 - 1/X^2)/(X-1/X) = X+1/X = V_1(X+1/X),
1515      so no addition occures here */
1516   /* if (deg >= 2)
1517      mpz_set (H[1], H[1]); Again, a no-op. */
1518 
1519   /* U_1(X+1/X) = 1, so this goes to coefficient of index 0 in std. basis */
1520   /* mpz_set (H[0], H[0]); Another no-op. */
1521 
1522   /* Now H[0 ... deg-1] contains the deg coefficients in standard basis
1523      of symmetric H(X) of degree 2*deg-2. */
1524 
1525   list_output_poly (H, deg, 0, 1, "/* list_scale_V */ H(x) = ", "\n",
1526 		    OUTPUT_TRACE);
1527 
1528   /* Square the symmetric H polynomial of degree 2*deg-2 (i.e. with deg
1529      coefficents in standard basis in H[0 ... deg-1]) */
1530 
1531   /* Bugfix: ks_multiply() does not like negative coefficients. */
1532 
1533   for (i = 0; i <= deg; i++)
1534     if (mpz_sgn (H[i]) < 0)
1535       {
1536 	mpz_add (H[i], H[i], modulus->orig_modulus);
1537 	if (mpz_sgn (H[i]) < 0)
1538 	  {
1539 	    outputf (OUTPUT_ERROR, "list_scale_V: H[%lu] still negative\n", i);
1540 	    mpz_mod (H[i], H[i], modulus->orig_modulus);
1541 	  }
1542       }
1543 
1544   if (dct != NULL && ntt_context != NULL)
1545     ntt_sqr_reciprocal (H, H, dct, deg, ntt_context);
1546   else
1547     list_sqr_reciprocal (H, H, deg, modulus->orig_modulus,
1548   		         newtmp, newtmplen);
1549 
1550   /* Now there are the 2*deg-1 coefficients in standard basis of a
1551      symmetric polynomial of degree 4*deg - 4 in H[0 ... 2*deg-2] */
1552 
1553   list_output_poly (H, 2*deg - 1, 0, 1, "/* list_scale_V */ H(x)^2 == ", "\n",
1554 		    OUTPUT_TRACE);
1555 
1556   /* Multiply by Q^2-4 */
1557   mpres_init (Vt, modulus);
1558   mpres_sqr (Vt, Q, modulus);
1559   mpres_sub_ui (Vt, Vt, 4, modulus);
1560 
1561 #if defined(_OPENMP)
1562 #pragma omp parallel if (deg > 1000)
1563   {
1564     mpmod_t modulus_local;
1565     long i; /* OpenMP insists on signed loop iteration var :( */
1566 
1567     mpmod_init_set (modulus_local, modulus);
1568 
1569 #pragma omp for
1570     for (i = 0; (unsigned long) i <= 2 * deg - 2; i++)
1571       mpres_mul_z_to_z (H[i], Vt, H[i], modulus_local);
1572     mpmod_clear (modulus_local);
1573   }
1574 #else
1575   for (i = 0; (unsigned long) i <= 2 * deg - 2; i++)
1576     mpres_mul_z_to_z (H[i], Vt, H[i], modulus);
1577 #endif
1578 
1579   list_output_poly (H, 2 * deg - 1, 0, 1, "/* list_scale_V */ "
1580 		    "H(x)^2*(Q^2-4) == ", "\n", OUTPUT_TRACE);
1581 
1582 
1583   /* Multiply by (X - 1/X)^2 = X^2 - 2 + 1/X^2 and subtract from G */
1584   ASSERT (newtmplen > 0UL);
1585   if (deg == 1)
1586     {
1587       /* H(X) has degree 2*deg-2 = 0, so H(X) = h_0
1588 	 H(X) * (X - 1/X)^2 = -2 h_0 + h_0 V_2(Y)  */
1589       mpz_mul_2exp (newtmp[0], H[0], 1UL);
1590       mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*H[0] */
1591       mpz_sub (G[2], G[2], H[0]);
1592     }
1593   else if (deg == 2)
1594     {
1595       /* H(X) has degree 2*deg-2 = 2, , so
1596 	 H(X) = h_0 + h_1 (X+1/X) + h_2 (X^2+1/X^2)
1597 
1598 	 H(X) * (X - 1/X)^2 =
1599 	 -2*(h_0 - h_2) - h_1 * V_1(Y) + (h_0 - 2*h_2) * V_2(Y) +
1600 	 h_1 * V_3(Y) + h_2 * V_4(Y)
1601       */
1602       mpz_sub (newtmp[0], H[0], H[2]);          /* h_0 - h_2 */
1603       mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* 2*(h_0 - h_2) */
1604       mpz_add (G[0], G[0], newtmp[0]);          /* G[0] -= -2*(h_0 - h_2) */
1605 
1606       mpz_add (G[1], G[1], H[1]);               /* G[1] -= -h_1 */
1607       mpz_sub (newtmp[0], newtmp[0], H[0]);     /* h_0 - 2*h_2 */
1608       mpz_sub (G[2], G[2], newtmp[0]);          /* G[2] -= h_0 - 2*h_2 */
1609       mpz_sub (G[3], G[3], H[1]);               /* G[3] -= h_1 */
1610       mpz_sub (G[4], G[4], H[2]);               /* G[3] -= h_2 */
1611     }
1612   else
1613     {
1614       /* Let H(X) = h_0 + \sum_{i=1}^{n} h_i V_i(Y), Y = X+1/X. Then
1615 	 (x - 1/x)^2 H(X) =
1616 	 -2(h_0 - h_2) +
1617 	 (- h_1 + h_3) V_1(Y) +
1618 	 \sum_{i=2}^{n-2} (h_{i-2} - 2h_i + h_{i+2}) V_i(Y) +
1619 	 (h_{n-3} - 2h_{n-1}) V_{n-1}(Y) +
1620 	 (h_{n-2} - 2h_n) V_n(Y) +
1621 	 h_{n-1} V_{n+1}(Y) +
1622 	 h_n V_{n+2}(Y)
1623 
1624 	 In our case, n = 2 * deg - 2
1625       */
1626       mpz_sub (newtmp[0], H[0], H[2]);
1627       mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* t[0] = 2*(h_0 - h_2) */
1628       mpz_add (G[0], G[0], newtmp[0]);          /* G[0] -= -2*(h_0 - h_2) */
1629 
1630       mpz_add (G[1], G[1], H[1]);
1631       mpz_sub (G[1], G[1], H[3]); /* G[1] -= -h_1 + h_3 */
1632 
1633       for (i = 2; i <= 2 * deg - 4; i++)
1634 	{
1635 	  mpz_mul_2exp (newtmp[0], H[i], 1);
1636 	  mpz_sub (newtmp[0], newtmp[0], H[i - 2]);
1637 	  mpz_sub (newtmp[0], newtmp[0], H[i + 2]); /* 2h_i-h_{i-2}-h_{i+2} */
1638 	  mpz_add (G[i], G[i], newtmp[0]); /* G[i] -= -2h_i+h_{i-2}+h_{i+2} */
1639 	}
1640 
1641       for ( ; i <= 2 * deg - 2; i++)
1642 	{
1643 	  mpz_mul_2exp (newtmp[0], H[i], 1UL);
1644 	  mpz_sub (newtmp[0], H[i - 2], newtmp[0]); /* h_{n-3} - 2h_{n-1} */
1645 	  mpz_sub (G[i], G[i], newtmp[0]);
1646 	}
1647 
1648       mpz_sub (G[i], G[i], H[i - 2]);
1649       mpz_sub (G[i + 1], G[i + 1], H[i - 1]);
1650     }
1651 
1652   for (i = 0; i <= 2 * deg; i++)
1653     mpz_mod (R[i], G[i], modulus->orig_modulus);
1654 
1655   if (test_verbose (OUTPUT_TRACE))
1656     for (i = 0; i <= 2 * deg; i++)
1657       outputf (OUTPUT_TRACE, "list_scale_V: R[%lu] = %Zd\n", i, R[i]);
1658 
1659 #ifdef WANT_ASSERT
1660   mpz_mod (R[2 * deg], R[2 * deg], modulus->orig_modulus);
1661   ASSERT (mpz_cmp (leading, R[2 * deg]) == 0);
1662   mpz_clear (leading);
1663 #endif
1664 
1665   mpres_clear (Vt, modulus);
1666 }
1667 
1668 
1669 #ifdef WANT_ASSERT
1670 /* Check if l is an (anti-)symmetric, possibly monic, polynomial.
1671    Returns -1 if it is (anti-)symmetric, or the smallest index i where
1672    l[i] != l[len - 1 + monic - i])
1673    If anti == 1, the list is checked for symmetry, if it is -1, for
1674    antisymmetry.
1675    This function is used only if assertions are enabled.
1676 */
1677 
1678 static long int ATTRIBUTE_UNUSED
list_is_symmetric(listz_t l,unsigned long len,int monic,int anti,mpz_t modulus,mpz_t tmp)1679 list_is_symmetric (listz_t l, unsigned long len, int monic, int anti,
1680 		   mpz_t modulus, mpz_t tmp)
1681 {
1682     unsigned long i;
1683 
1684     ASSERT (monic == 0 || monic == 1);
1685     ASSERT (anti == 1 || anti == -1);
1686 
1687     if (monic && anti == 1 && mpz_cmp_ui (l[0], 1) != 0)
1688 	return 0L;
1689 
1690     if (monic && anti == -1)
1691       {
1692 	mpz_sub_ui (tmp, modulus, 1);
1693 	if (mpz_cmp (tmp, l[0]) != 0)
1694 	  return 0L;
1695       }
1696 
1697     for (i = monic; i < len / 2; i++)
1698       {
1699 	if (anti == -1)
1700 	  {
1701 	    /* Negate (mod modulus) */
1702 	    if (mpz_sgn (l[i]) == 0)
1703 	      {
1704 		if (mpz_sgn (l[len - 1 + monic - i]) != 0)
1705 		  return (long) i;
1706 	      }
1707 	    else
1708 	      {
1709 		mpz_sub (tmp, modulus, l[i]);
1710 		if (mpz_cmp (tmp, l[len - 1 + monic - i]) != 0)
1711 		  return (long) i;
1712 	      }
1713 	  }
1714 	else if (mpz_cmp (l[i], l[len - 1 + monic - i]) != 0)
1715 	    return (long) i;
1716       }
1717 
1718     return -1L;
1719 }
1720 #endif
1721 
1722 #if 0 && defined(WANT_ASSERT)
1723 /* Evaluate a polynomial of degree n-1 with all coefficients given in F[],
1724    or of degree n with an implicit leading 1 monomial not stored in F[],
1725    at x modulo modulus. Result goes in r. tmp needs 2 entries. */
1726 
1727 static void
1728 list_eval_poly (mpz_t r, const listz_t F, const mpz_t x,
1729 		const unsigned long n, const int monic, const mpz_t modulus,
1730 		listz_t tmp)
1731 {
1732   unsigned long i;
1733 
1734   mpz_set_ui (tmp[0], 1UL);
1735   mpz_set_ui (r, 0UL);
1736 
1737   for (i = 0UL; i < n; i++)
1738     {
1739       /* tmp[0] = x^i */
1740       mpz_mul (tmp[1], F[i], tmp[0]);
1741       mpz_mod (tmp[1], tmp[1], modulus);
1742       mpz_add (r, r, tmp[1]);
1743 
1744       mpz_mul (tmp[1], tmp[0], x);
1745       mpz_mod (tmp[0], tmp[1], modulus);
1746     }
1747 
1748   if (monic)
1749     mpz_add (r, r, tmp[0]);
1750 
1751   mpz_mod (r, r, modulus);
1752 }
1753 #endif
1754 
1755 /* return the memory needed in the F[] array for poly_from_sets_V */
1756 static unsigned long
mem_poly_from_sets_V(sets_long_t * sets)1757 mem_poly_from_sets_V (sets_long_t *sets)
1758 {
1759   unsigned long c, deg, i, nr;
1760   set_long_t *set = sets->sets;
1761   unsigned long mem, maxmem = 0;
1762 
1763   deg = 1UL;
1764   for (nr = sets->nr - 1UL; nr > 0UL; nr--)
1765     {
1766       set = sets_nextset (sets->sets); /* Skip first set */
1767       for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
1768         set = sets_nextset (set);
1769       c = set->card;
1770       if (c == 2UL)
1771         deg *= 2UL;
1772       else
1773         {
1774           i = (c - 1UL) / 2UL - 1; /* maximal value of i */
1775           /* list_scale_V needs 2*deg+1 entries starting at
1776              F + (2UL * i + 1UL) * (deg + 1UL) */
1777           mem = (2UL * i + 1UL) * (deg + 1UL) + (2 * deg + 1UL);
1778           if (mem > maxmem)
1779             maxmem = mem;
1780 	  deg *= c;
1781 	}
1782     }
1783 
1784   return maxmem;
1785 }
1786 
1787 /* Build a polynomial with roots r^2i, i in the sumset of the sets in "sets".
1788    The parameter Q = r + 1/r. This code uses the fact that the polynomials
1789    are symmetric. Requires that the first set in "sets" has cardinality 2,
1790    all sets must be symmetric around 0. The resulting polynomial of degree
1791    2*d is F(x) = f_0 + \sum_{1 <= i <= d} f_i (x^i + 1/x^i). The coefficient
1792    f_i is stored in F[i], which therefore needs d+1 elements. */
1793 
1794 static unsigned long
poly_from_sets_V(listz_t F,const mpres_t Q,sets_long_t * sets,listz_t tmp,const unsigned long tmplen,mpmod_t modulus,mpzspv_t dct,const mpzspm_t ntt_context)1795 poly_from_sets_V (listz_t F, const mpres_t Q, sets_long_t *sets,
1796 		  listz_t tmp, const unsigned long tmplen, mpmod_t modulus,
1797 		  mpzspv_t dct, const mpzspm_t ntt_context)
1798 {
1799   unsigned long c, deg, i, nr;
1800   set_long_t *set = sets->sets;
1801   mpres_t Qt;
1802 
1803   ASSERT_ALWAYS (sets->nr > 0UL);
1804   ASSERT_ALWAYS (set->card == 2UL); /* Check that the cardinality of
1805                                        first set is 2 */
1806   /* Check that first set is symmetric around 0 (we write card-1
1807      instead of 1 to avoid a compiler warning with clang 2.9) */
1808   ASSERT_ALWAYS (set->elem[0] == -set->elem[set->card - 1]);
1809 
1810   if (test_verbose (OUTPUT_TRACE))
1811     {
1812       mpz_t t;
1813       mpz_init (t);
1814       mpres_get_z (t, Q, modulus);
1815       outputf (OUTPUT_TRACE, "poly_from_sets_V (F, Q = %Zd, sets)\n", t);
1816       mpz_clear (t);
1817     }
1818 
1819   mpres_init (Qt, modulus);
1820 
1821   outputf (OUTPUT_DEVVERBOSE, " (processing set of size 2");
1822 
1823   V (Qt, Q, set->elem[0], modulus); /* First set in sets is {-k, k} */
1824   V (Qt, Qt, 2UL, modulus);         /* Qt = V_2k(Q) */
1825 
1826   mpres_neg (Qt, Qt, modulus);
1827   mpres_get_z (F[0], Qt, modulus);
1828   mpz_set_ui (F[1], 1UL);
1829   deg = 1UL;
1830   /* Here, F(x) = (x - r^{2k_1})(x - r^{-2k_1}) / x =
1831                   (x^2 - x (r^{2k_1} + r^{-2k_1}) + 1) / x =
1832 		  (x + 1/x) - V_{2k_1}(r + 1/r) */
1833 
1834   for (nr = sets->nr - 1UL; nr > 0UL; nr--)
1835     {
1836       /* Assuming the sets are sorted in order of ascending cardinality,
1837          we process them back-to-front so the sets of cardinality 2 are
1838          processed last, but skipping the first set which we processed
1839          already. */
1840 
1841       set = sets_nextset (sets->sets); /* Skip first set */
1842       for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
1843         set = sets_nextset (set);
1844 
1845       /* Process this set. We assume it is either of cardinality 2, or of
1846 	 odd cardinality */
1847       c = set->card;
1848       outputf (OUTPUT_DEVVERBOSE, " %lu", c);
1849 
1850       if (c == 2UL)
1851 	{
1852 	  /* Check it's symmetric (we write c-1 instead of 2 to avoid a
1853            compiler warning with clang 2.9) */
1854 	  ASSERT_ALWAYS (set->elem[0] == -set->elem[c - 1]);
1855 	  V (Qt, Q, set->elem[0], modulus);
1856 	  V (Qt, Qt, 2UL, modulus);
1857 	  list_scale_V (F, F, Qt, deg, modulus, tmp, tmplen, dct,
1858 	                ntt_context);
1859 	  deg *= 2UL;
1860 	  ASSERT_ALWAYS (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
1861 	}
1862       else
1863 	{
1864 	  ASSERT_ALWAYS (c % 2UL == 1UL);
1865 	  ASSERT_ALWAYS (set->elem[(c - 1UL) / 2UL] == 0UL);
1866 	  /* Generate the F(Q^{2k_i} * X)*F(Q^{-2k_i} * X) polynomials.
1867 	     Each is symmetric of degree 2*deg, so each has deg+1 coeffients
1868 	     in standard basis. */
1869 	  for (i = 0UL; i < (c - 1UL) / 2UL; i++)
1870 	    {
1871               /* Check it's symmetric */
1872 	      ASSERT_ALWAYS (set->elem[i] == -set->elem[c - 1L - i]);
1873 	      V (Qt, Q, set->elem[i], modulus);
1874 	      V (Qt, Qt, 2UL, modulus);
1875 	      ASSERT (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
1876 	      list_scale_V (F + (2UL * i + 1UL) * (deg + 1UL), F, Qt, deg,
1877 	                    modulus, tmp, tmplen, dct, ntt_context);
1878 	      ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL * deg],
1879 	              1UL) == 0); /* Check it's monic */
1880 	    }
1881 	  /* Multiply the polynomials */
1882 	  for (i = 0UL; i < (c - 1UL) / 2UL; i++)
1883 	    {
1884 	      /* So far, we have the product
1885 		 F(X) * F(Q^{2k_j} * X) * F(Q^{-2k_j} * X), 1 <= j <= i,
1886 		 at F. This product has degree 2 * deg + i * 4 * deg, that is
1887 		 (2 * i + 1) * 2 * deg, which means (2 * i + 1) * deg + 1
1888 		 coefficients in F[0 ... (i * 2 + 1) * deg]. */
1889 	      ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * deg], 1UL) == 0);
1890 	      ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL*deg],
1891 	                          1UL) == 0);
1892 	      list_output_poly (F, (2UL * i + 1UL) * deg + 1, 0, 1,
1893 				"poly_from_sets_V: Multiplying ", "\n",
1894 				OUTPUT_TRACE);
1895 	      list_output_poly (F + (2UL * i + 1UL) * (deg + 1UL),
1896 	                        2UL * deg + 1UL, 0, 1, " and ", "\n",
1897 	                        OUTPUT_TRACE);
1898 	      list_mul_reciprocal (F,
1899 		                   F, (2UL * i + 1UL) * deg + 1UL,
1900 			 	   F + (2UL * i + 1UL) * (deg + 1UL),
1901 				   2UL * deg + 1UL, modulus->orig_modulus,
1902 				   tmp, tmplen);
1903 	      list_mod (F, F, (2UL * i + 3UL) * deg + 1UL,
1904 	                modulus->orig_modulus);
1905 	      list_output_poly (F, (2UL * i + 3UL) * deg + 1UL, 0, 1,
1906                                 " = ", "\n", OUTPUT_TRACE);
1907 	      ASSERT (mpz_cmp_ui (F[(2UL * i + 3UL) * deg], 1UL) == 0);
1908 	    }
1909 	  deg *= c;
1910 	}
1911     }
1912 
1913   mpres_clear (Qt, modulus);
1914   outputf (OUTPUT_DEVVERBOSE, ")");
1915 
1916   return deg;
1917 }
1918 
1919 static int
build_F_ntt(listz_t F,const mpres_t P_1,sets_long_t * S_1,const faststage2_param_t * params,mpmod_t modulus)1920 build_F_ntt (listz_t F, const mpres_t P_1, sets_long_t *S_1,
1921 	     const faststage2_param_t *params, mpmod_t modulus)
1922 {
1923   mpzspm_t F_ntt_context;
1924   mpzspv_t F_ntt;
1925   unsigned long tmplen;
1926   listz_t tmp;
1927   long timestart, realstart;
1928   unsigned long i;
1929 
1930   timestart = cputime ();
1931   realstart = realtime ();
1932 
1933   /* Precompute the small primes, primitive roots and inverses etc. for
1934      the NTT. The code to multiply wants a 3*k-th root of unity, where
1935      k is the smallest power of 2 with k > s_1/2 */
1936 
1937   F_ntt_context = mpzspm_init (3UL << ceil_log2 (params->s_1 / 2 + 1),
1938 			       modulus->orig_modulus);
1939   if (F_ntt_context == NULL)
1940     {
1941       outputf (OUTPUT_ERROR, "Could not initialise F_ntt_context, "
1942                "presumably out of memory\n");
1943       return ECM_ERROR;
1944     }
1945 
1946   print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for building F = ",
1947 		    F_ntt_context);
1948 
1949   outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
1950 
1951   tmplen = params->s_1 + 100;
1952   tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
1953   F_ntt = mpzspv_init (1UL << ceil_log2 (params->s_1 / 2 + 1), F_ntt_context);
1954 
1955   i = poly_from_sets_V (F, P_1, S_1, tmp, tmplen, modulus, F_ntt,
1956                         F_ntt_context);
1957   ASSERT_ALWAYS(2 * i == params->s_1);
1958   ASSERT_ALWAYS(mpz_cmp_ui (F[i], 1UL) == 0);
1959 
1960   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
1961   if (test_verbose (OUTPUT_TRACE))
1962     {
1963       for (i = 0; i < params->s_1 / 2 + 1; i++)
1964 	outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
1965       outputf (OUTPUT_TRACE, "f(x) = f_0");
1966       for (i = 1; i < params->s_1 / 2 + 1; i++)
1967 	outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
1968       outputf (OUTPUT_TRACE, "/* PARI */ \n");
1969     }
1970 
1971   clear_list (tmp, tmplen);
1972   tmp = NULL;
1973   mpzspv_clear (F_ntt, F_ntt_context);
1974   F_ntt = NULL;
1975   mpzspm_clear (F_ntt_context);
1976   F_ntt_context = NULL;
1977 
1978   return 0;
1979 }
1980 
1981 /* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l.
1982    x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P.
1983    Stores the result in g[0 ... l] and/or in g_ntt[offset ... offset + l] */
1984 
1985 static void
pm1_sequence_g(listz_t g_mpz,mpzspv_t g_ntt,const mpres_t b_1,const unsigned long P,const long M_param,const unsigned long l_param,const mpz_t m_1,const long k_2,mpmod_t modulus_param,const mpzspm_t ntt_context)1986 pm1_sequence_g (listz_t g_mpz, mpzspv_t g_ntt, const mpres_t b_1,
1987                 const unsigned long P, const long M_param,
1988 		const unsigned long l_param, const mpz_t m_1, const long k_2,
1989 		mpmod_t modulus_param, const mpzspm_t ntt_context)
1990 {
1991   mpres_t r[3], x_0, x_Mi;
1992   mpz_t t;
1993   unsigned long i;
1994   long timestart, realstart;
1995   long M = M_param;
1996   unsigned long l = l_param, offset = 0UL;
1997   mpmod_t modulus;
1998   int want_output = 1;
1999 
2000   outputf (OUTPUT_VERBOSE, "Computing g_i");
2001   outputf (OUTPUT_DEVVERBOSE, "\npm1_sequence_g: P = %lu, M_param = %lu, "
2002            "l_param = %lu, m_1 = %Zd, k_2 = %lu\n",
2003 	   P, M_param, l_param, m_1, k_2);
2004   timestart = cputime ();
2005   realstart = realtime ();
2006 
2007 #ifdef _OPENMP
2008 #pragma omp parallel if (l > 100) private(r, x_0, x_Mi, t, i, M, l, offset, modulus, want_output)
2009   {
2010     /* When multi-threading, we adjust the parameters for each thread */
2011 
2012     const int nr_chunks = omp_get_num_threads();
2013     const int thread_nr = omp_get_thread_num();
2014 
2015     l = (l_param - 1) / nr_chunks + 1; /* = ceil(l_param / nr_chunks) */
2016     offset = thread_nr * l;
2017     if (offset <= l_param)
2018       l = MIN(l, l_param - offset);
2019     else
2020       l = 0;
2021     outputf (OUTPUT_DEVVERBOSE,
2022              "pm1_sequence_g: thread %d has l = %lu, offset = %lu.\n",
2023              thread_nr, l, offset);
2024     M = M_param - (long) offset;
2025 
2026     /* Let only the master thread print stuff */
2027     want_output = (thread_nr == 0);
2028 
2029     if (want_output)
2030       outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
2031 #endif
2032 
2033   /* Make a private copy of the mpmod_t struct */
2034   mpmod_init_set (modulus, modulus_param);
2035 
2036   mpz_init (t);
2037   mpres_init (r[0], modulus);
2038   mpres_init (r[1], modulus);
2039   mpres_init (r[2], modulus);
2040   mpres_init (x_0, modulus);
2041   mpres_init (x_Mi, modulus);
2042 
2043   if (want_output)
2044     {
2045       if (test_verbose (OUTPUT_TRACE))
2046 	{
2047 	  mpres_get_z (t, b_1, modulus);
2048 	  outputf (OUTPUT_TRACE, "\n/* pm1_sequence_g */ N = %Zd; "
2049 		   "b_1 = Mod(%Zd, N); /* PARI */\n", modulus->orig_modulus, t);
2050 	  outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ P = %lu; M = %ld; "
2051 		   "m_1 = %Zd; /* PARI */\n", P, M, m_1);
2052 	  outputf (OUTPUT_TRACE,
2053 		   "/* pm1_sequence_g */ r = b_1^P; /* PARI */\n");
2054 	  outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 = "
2055 		   "b_1^(2*%ld + (2*m_1 + 1)*P); /* PARI */\n", k_2);
2056 	}
2057     }
2058 
2059   /* We use (M-(i+1))^2 = (M-i)^2 + 2*(-M+i) + 1 */
2060   mpz_set_ui (t, P);
2061   mpres_pow (r[0], b_1, t, modulus);     /* r[0] = b_1^P = r */
2062   if (test_verbose (OUTPUT_TRACE))
2063     {
2064       mpres_get_z (t, r[0], modulus);
2065       outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ r == %Zd /* PARI C */\n", t);
2066     }
2067 
2068   /* FIXME: This is a huge mess, clean up some time */
2069 
2070   mpz_set_si (t, M);
2071   mpz_neg (t, t);
2072   mpz_mul_2exp (t, t, 1UL);
2073   mpz_add_ui (t, t, 1UL);
2074   /* Warning: t might be negative here. */
2075   mpres_pow (r[1], r[0], t, modulus);    /* r[1] = r^{2(-M+i)+1}, i = 0 */
2076   mpz_set_si (t, M);
2077   mpz_mul (t, t, t);                     /* t = M^2 */
2078   mpres_pow (r[2], r[0], t, modulus);    /* r[2] = r^{(M-i)^2}, i = 0 */
2079   mpres_sqr (r[0], r[0], modulus); /* r[0] = r^2 */
2080 
2081   mpz_mul_2exp (t, m_1, 1UL);
2082   mpz_add_ui (t, t, 1UL);
2083   mpz_mul_ui (t, t, P);
2084   mpz_add_si (t, t, k_2);
2085   mpz_add_si (t, t, k_2);
2086   if (want_output)
2087     outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ 2*%ld + (2*%Zd + 1)*P == "
2088 	     "%Zd /* PARI C */\n", k_2, m_1, t);
2089 
2090   /* Warning: t might be negative here. */
2091   mpres_pow (x_0, b_1, t, modulus);  /* x_0 = b_1^{2*k_2 + (2*m_1 + 1)*P} */
2092   if (want_output && test_verbose (OUTPUT_TRACE))
2093     {
2094       mpres_get_z (t, x_0, modulus);
2095       outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 == %Zd /* PARI C */\n",
2096 	       t);
2097     }
2098 
2099   mpz_set_si (t, M);
2100   mpres_pow (x_Mi, x_0, t, modulus); /* x_Mi = x_0^{M-i}, i = 0 */
2101 
2102   mpres_invert (x_0, x_0, modulus);  /* x_0 := x_0^{-1} now */
2103   mpres_mul (r[1], r[1], x_0, modulus); /* r[1] = x_0^{-1} * r^{-2M+1} */
2104 
2105   mpres_mul (r[2], r[2], x_Mi, modulus); /* r[2] = x_0^M * r^{M^2} */
2106   mpres_get_z (t, r[2], modulus);
2107   outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd; /* PARI */\n",
2108 	   offset, t);
2109   if (l > 0)
2110     {
2111       if (g_mpz != NULL)
2112         mpz_set (g_mpz[offset], t);
2113       if (g_ntt != NULL)
2114         mpzspv_from_mpzv (g_ntt, offset, &t, 1UL, ntt_context);
2115     }
2116 
2117   /* So here we have for i = 0
2118      r[2] = x_0^(M-i) * r^{(M-i)^2}
2119      r[1] = x_0^{-1} * r^{2(-M+i)+1}
2120      r[0] = r^2
2121      t = r[2]
2122   */
2123 
2124   for (i = 1; i < l; i++)
2125     {
2126       if (g_mpz != NULL)
2127         {
2128 	  mpres_mul_z_to_z (g_mpz[offset + i], r[1], g_mpz[offset + i - 1],
2129 			    modulus);
2130 	  outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
2131 		   " /* PARI */\n", offset + i, g_mpz[offset + i]);
2132         }
2133       if (g_ntt != NULL)
2134       {
2135 	  mpres_mul_z_to_z (t, r[1], t, modulus);
2136 	  if (g_mpz == NULL) /* Only one should be non-NULL... */
2137 	      outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
2138 		       " /* PARI */\n", offset + i, t);
2139 	  mpzspv_from_mpzv (g_ntt, offset + i, &t, 1UL, ntt_context);
2140       }
2141       mpres_mul (r[1], r[1], r[0], modulus);
2142     }
2143 
2144   mpres_clear (r[0], modulus);
2145   mpres_clear (r[1], modulus);
2146   mpres_clear (r[2], modulus);
2147   mpres_clear (x_0, modulus);
2148   mpres_clear (x_Mi, modulus);
2149   mpz_clear (t);
2150   mpmod_clear (modulus); /* Clear our private copy of modulus */
2151 
2152 #ifdef _OPENMP
2153   }
2154 #endif
2155 
2156   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
2157 
2158   if (test_verbose (OUTPUT_TRACE))
2159     {
2160       for (i = 0; i < l_param; i++)
2161 	{
2162 	  outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu == x_0^"
2163 		   "(M - %lu) * r^((M - %lu)^2) /* PARI C */\n", i, i, i);
2164 	}
2165       outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g(x) = g_0");
2166       for (i = 1; i < l; i++)
2167 	outputf (OUTPUT_TRACE, " + g_%lu * x^%lu", i, i);
2168       outputf (OUTPUT_TRACE, " /* PARI */\n");
2169     }
2170 }
2171 
2172 
2173 /* Compute h_j = r^(-j^2) * f_j for 0 <= j < d as described in section 9
2174    of the paper. h == f is ok. */
2175 
2176 static void
pm1_sequence_h(listz_t h,mpzspv_t h_ntt,mpz_t * f,const mpres_t r,const unsigned long d,mpmod_t modulus_parm,const mpzspm_t ntt_context)2177 pm1_sequence_h (listz_t h, mpzspv_t h_ntt, mpz_t *f, const mpres_t r,
2178 		const unsigned long d, mpmod_t modulus_parm,
2179 		const mpzspm_t ntt_context)
2180 {
2181   mpres_t invr;  /* r^{-1}. Can be shared between threads */
2182   long timestart, realstart;
2183 
2184   mpres_init (invr, modulus_parm);
2185   mpres_invert (invr, r, modulus_parm); /* invr = r^{-1}. FIXME: test for
2186 					   failure, even if theoretically
2187 					   impossible */
2188 
2189   if (test_verbose (OUTPUT_TRACE))
2190     {
2191       mpz_t t;
2192       mpz_init (t);
2193       mpres_get_z (t, r, modulus_parm);
2194       outputf (OUTPUT_TRACE, "\n/* pm1_sequence_h */ N = %Zd; "
2195 	       "r = Mod(%Zd, N); /* PARI */\n",
2196 	       modulus_parm->orig_modulus, t);
2197       mpz_clear (t);
2198     }
2199 
2200   outputf (OUTPUT_VERBOSE, "Computing h");
2201   timestart = cputime ();
2202   realstart = realtime ();
2203 
2204 #ifdef _OPENMP
2205 #pragma omp parallel if (d > 100)
2206 #endif
2207   {
2208     mpres_t fd[3]; /* finite differences table for r^{-i^2}*/
2209     mpz_t t;       /* the h_j value as an mpz_t */
2210     unsigned long j;
2211     unsigned long offset = 0UL, len = d;
2212     mpmod_t modulus;
2213 
2214     /* Adjust offset and length for this thread */
2215 #ifdef _OPENMP
2216     {
2217       const int nr_chunks = omp_get_num_threads();
2218       const int thread_nr = omp_get_thread_num();
2219       unsigned long chunklen;
2220 
2221       if (thread_nr == 0)
2222 	outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
2223 
2224       chunklen = (len - 1UL) / (unsigned long) nr_chunks + 1UL;
2225       offset = chunklen * (unsigned long) thread_nr;
2226       if (offset <= len)
2227         len = MIN(chunklen, len - offset);
2228       else
2229         len = 0;
2230     }
2231 #endif
2232 
2233     mpmod_init_set (modulus, modulus_parm);
2234     mpres_init (fd[0], modulus);
2235     mpres_init (fd[1], modulus);
2236     mpres_init (fd[2], modulus);
2237     mpz_init (t);
2238 
2239     /* We have (n + 1)^2 = n^2 + 2n + 1. For the finite differences we'll
2240        need r^{-2}, r^{-(2n+1)}, r^{-n^2}. Init for n = 0. */
2241 
2242     /* r^{-2} in fd[0] is constant and could be shared. Computing it
2243        separately in each thread has the advantage of putting it in
2244        local memory. May not make much difference overall */
2245 
2246     mpres_sqr (fd[0], invr, modulus);       /* fd[0] = r^{-2} */
2247     mpz_set_ui (t, offset);
2248     mpz_mul_2exp (t, t, 1UL);
2249     mpz_add_ui (t, t, 1UL);                 /* t = 2 * offset + 1 */
2250     mpres_pow (fd[1], invr, t, modulus);    /* fd[1] = r^{-(2*offset+1)} */
2251     mpz_set_ui (t, offset);
2252     mpz_mul (t, t, t);                      /* t = offset^2 */
2253     mpres_pow (fd[2], invr, t, modulus);    /* fd[2] = r^{-offset^2} */
2254 
2255     /* Generate the sequence */
2256     for (j = offset; j < offset + len; j++)
2257       {
2258 	mpres_mul_z_to_z (t, fd[2], f[j], modulus);
2259 	outputf (OUTPUT_TRACE,
2260 		 "/* pm1_sequence_h */ h_%lu = %Zd; /* PARI */\n", j, t);
2261 
2262 	if (h != NULL)
2263 	  mpz_set (h[j], t);
2264 	if (h_ntt != NULL)
2265 	  mpzspv_from_mpzv (h_ntt, j, &t, 1UL, ntt_context);
2266 
2267 	mpres_mul (fd[2], fd[2], fd[1], modulus); /* fd[2] = r^{-j^2} */
2268 	mpres_mul (fd[1], fd[1], fd[0], modulus); /* fd[1] = r^{-2*j-1} */
2269       }
2270 
2271     mpres_clear (fd[2], modulus);
2272     mpres_clear (fd[1], modulus);
2273     mpres_clear (fd[0], modulus);
2274     mpz_clear (t);
2275     mpmod_clear (modulus);
2276   }
2277 
2278   mpres_clear (invr, modulus_parm);
2279 
2280   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
2281 
2282   if (test_verbose (OUTPUT_TRACE))
2283     {
2284       unsigned long j;
2285       for (j = 0; j < d; j++)
2286 	outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h_%lu == "
2287 		   "f_%lu * r^(-%lu^2) /* PARI C */\n", j, j, j);
2288       outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h(x) = h_0");
2289       for (j = 1; j < d; j++)
2290         outputf (OUTPUT_TRACE, " + h_%lu * (x^%lu + x^(-%lu))", j, j, j);
2291       outputf (OUTPUT_TRACE, " /* PARI */\n");
2292     }
2293 }
2294 
2295 
2296 static int
make_S_1_S_2(sets_long_t ** S_1,set_long_t ** S_2,const faststage2_param_t * params)2297 make_S_1_S_2 (sets_long_t **S_1, set_long_t **S_2,
2298               const faststage2_param_t *params)
2299 {
2300   unsigned long i;
2301   sets_long_t *facS_2;
2302   size_t facS_2_size;
2303 
2304   *S_1 = sets_get_factored_sorted (params->P);
2305   if (*S_1 == NULL)
2306     return ECM_ERROR;
2307 
2308   {
2309     mpz_t t1, t2;
2310 
2311     mpz_init (t1);
2312     mpz_init (t2);
2313     sets_sumset_minmax (t1, *S_1, 1);
2314     sets_max (t2, params->P);
2315     ASSERT_ALWAYS (mpz_cmp (t1, t2) == 0);
2316     mpz_clear (t1);
2317     mpz_clear (t2);
2318   }
2319 
2320   *S_2 = malloc (set_sizeof(params->s_2));
2321   if (*S_2 == NULL)
2322     {
2323       free (*S_1);
2324       return ECM_ERROR;
2325     }
2326 
2327   /* Extract sets for S_2 and compute the set of sums */
2328 
2329   sets_extract (NULL, &facS_2_size, *S_1, params->s_2);
2330   facS_2 = malloc (facS_2_size);
2331   if (facS_2 == NULL)
2332     {
2333       free (*S_1);
2334       free (*S_2);
2335       return ECM_ERROR;
2336     }
2337   sets_extract (facS_2, NULL, *S_1, params->s_2);
2338   sets_sumset (*S_2, facS_2);
2339   ASSERT_ALWAYS ((*S_2)->card == params->s_2);
2340   free (facS_2);
2341   quicksort_long ((*S_2)->elem, (*S_2)->card);
2342 
2343   /* Print the sets in devverbose mode */
2344   if (test_verbose (OUTPUT_DEVVERBOSE))
2345     {
2346       outputf (OUTPUT_DEVVERBOSE, "S_1 = ");
2347       sets_print (OUTPUT_DEVVERBOSE, *S_1);
2348 
2349       outputf (OUTPUT_DEVVERBOSE, "S_2 = {");
2350       for (i = 0UL; i + 1UL < params->s_2; i++)
2351 	outputf (OUTPUT_DEVVERBOSE, "%ld, ", (*S_2)->elem[i]);
2352       if (i < params->s_2)
2353 	outputf (OUTPUT_DEVVERBOSE, "%ld", (*S_2)->elem[i]);
2354       outputf (OUTPUT_DEVVERBOSE, "}\n");
2355     }
2356 
2357   return 0;
2358 }
2359 
2360 #if 0
2361 static mpzspv_t *
2362 mpzspv_init_mt (spv_size_t len, mpzspm_t mpzspm)
2363 {
2364   int i; /* OpenMP wants the iteration variable a signed type */
2365   mpzspv_t *x = (mpzspv_t *) malloc (mpzspm->sp_num * sizeof (spv_t *));
2366 
2367   if (x == NULL)
2368     return NULL;
2369 
2370   for (i = 0; i < (int) mpzspm->sp_num; i++)
2371     x[i] = NULL;
2372 
2373 #ifdef _OPENMP
2374 #pragma omp parallel private(i) shared(x)
2375   {
2376 #pragma omp for
2377 #endif
2378     for (i = 0; i < (int) mpzspm->sp_num; i++)
2379       x[i] = (spv_t *) sp_aligned_malloc (len * sizeof (sp_t));
2380 
2381 #ifdef _OPENMP
2382   }
2383 #endif
2384 
2385   for (i = 0; i < (int) mpzspm->sp_num; i++)
2386     if (x[i] == NULL)
2387       break;
2388 
2389   if (i != (int) mpzspm->sp_num) /* There is a NULL pointer */
2390     {
2391       for (i = 0; i < (int) mpzspm->sp_num; i++)
2392 	if (x[i] != NULL)
2393 	  sp_aligned_free(x[i]);
2394       return NULL;
2395     }
2396 
2397 #if 0
2398   if (test_verbose (OUTPUT_DEVVERBOSE))
2399     {
2400       spv_t * last = x[0];
2401       printf ("mpzspv_init_mt: x[0] = %p\n", x[0]);
2402       for (i = 1; i < (int) mpzspm->sp_num; i++)
2403         printf ("mpzspv_init_mt: x[%d] = %p, distance = %ld\n",
2404                 i, x[i], (long) (x[i] - x[i-1]));
2405     }
2406 #endif
2407 
2408   return x;
2409 }
2410 #endif
2411 
2412 /* Square the reciprocal Laurent polynomial S(x) of degree 2*n-2.
2413    S(x) = s_0 + \sum_{i=1}^{n-1} s_i (x^i + x^{-1}).
2414    S[i] contains the n coefficients s_i, 0 <= i <= n-1.
2415    R[i] will contain the 2n-1 coefficients r_i, 0 <= i <= 2*n-2, where
2416    R(x) = S(x)^2 = r_0 + \sum_{i=1}^{2n-2} r_i (x^i + x^{-1}).
2417    dft must have power of 2 length len >= 2n.
2418    The NTT primes must be == 1 (mod 3*len).
2419 */
2420 
2421 #undef TRACE_ntt_sqr_reciprocal
2422 static void
ntt_sqr_reciprocal(mpzv_t R,const mpzv_t S,mpzspv_t dft,const spv_size_t n,const mpzspm_t ntt_context)2423 ntt_sqr_reciprocal (mpzv_t R, const mpzv_t S, mpzspv_t dft,
2424 		    const spv_size_t n, const mpzspm_t ntt_context)
2425 {
2426 #ifdef WANT_ASSERT
2427   mpz_t S_eval_1, R_eval_1;
2428 #endif
2429 
2430   if (n == 0)
2431     return;
2432 
2433   if (n == 1)
2434     {
2435       mpz_mul (R[0], S[0], S[0]);
2436       mpz_mod (R[0], R[0], ntt_context->modulus);
2437       return;
2438     }
2439 
2440 #ifdef WANT_ASSERT
2441   mpz_init (S_eval_1);
2442   list_recip_eval1 (S_eval_1, S, n);
2443   /* Compute (S(1))^2 */
2444   mpz_mul (S_eval_1, S_eval_1, S_eval_1);
2445   mpz_mod (S_eval_1, S_eval_1, ntt_context->modulus);
2446 #endif
2447 
2448 #ifdef TRACE_ntt_sqr_reciprocal
2449   printf ("ntt_sqr_reciprocal: n %lu, length %lu\n", n, len);
2450   gmp_printf ("Input polynomial is %Zd", S[0]);
2451   {
2452     int j;
2453     for (j = 1; (spv_size_t) j < n; j++)
2454       gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", S[j], j, j);
2455   }
2456   printf ("\n");
2457 #endif
2458 
2459   /* Fill NTT elements [0 .. n-1] with coefficients */
2460   mpzspv_from_mpzv (dft, (spv_size_t) 0, S, n, ntt_context);
2461   mpzspv_sqr_reciprocal (dft, n, ntt_context);
2462 
2463 #if defined(_OPENMP)
2464 #pragma omp parallel if (n > 50)
2465 #endif
2466   {
2467     spv_size_t i, offset = 0, chunklen = 2*n - 1;
2468 
2469 #if defined(_OPENMP)
2470     {
2471       const int nr_chunks = omp_get_num_threads();
2472       const int thread_nr = omp_get_thread_num();
2473 
2474       chunklen = (chunklen - 1) / (spv_size_t) nr_chunks + 1;
2475       offset = (spv_size_t) thread_nr * chunklen;
2476       if (offset <= 2*n - 1)
2477         chunklen = MIN(chunklen, (2*n - 1) - offset);
2478       else
2479         chunklen = 0UL;
2480     }
2481 #endif
2482 
2483     mpzspv_to_mpzv (dft, offset, R + offset, chunklen, ntt_context);
2484     for (i = offset; i < offset + chunklen; i++)
2485       mpz_mod (R[i], R[i], ntt_context->modulus);
2486   }
2487 
2488 #ifdef TRACE_ntt_sqr_reciprocal
2489   gmp_printf ("ntt_sqr_reciprocal: Output polynomial is %Zd", R[0]);
2490   for (j = 1; (spv_size_t) j < 2*n - 1; j++)
2491     gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
2492   printf ("\n");
2493 #endif
2494 
2495 #ifdef WANT_ASSERT
2496   mpz_init (R_eval_1);
2497   /* Compute (S^2)(1) and compare to (S(1))^2 */
2498   list_recip_eval1 (R_eval_1, R, 2 * n - 1);
2499   mpz_mod (R_eval_1, R_eval_1, ntt_context->modulus);
2500   if (mpz_cmp (R_eval_1, S_eval_1) != 0)
2501     {
2502       gmp_fprintf (stderr, "ntt_sqr_reciprocal: (S(1))^2 = %Zd but "
2503 		   "(S^2)(1) = %Zd\n", S_eval_1, R_eval_1);
2504 #if 0
2505       gmp_printf ("Output polynomial is %Zd", R[0]);
2506       for (j = 1; (spv_size_t) j < 2*n - 1; j++)
2507 	gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
2508       printf ("\n");
2509 #endif
2510       abort ();
2511     }
2512   mpz_clear (S_eval_1);
2513   mpz_clear (R_eval_1);
2514 #endif
2515 }
2516 
2517 
2518 /* Computes gcd(\prod_{0 <= i < len} (ntt[i + offset] + add[i]), N),
2519    the NTT residues are converted to integer residues (mod N) first.
2520    If add == NULL, add[i] is assumed to be 0. */
2521 
2522 static void
ntt_gcd(mpz_t f,mpz_t * product,mpzspv_t ntt,const unsigned long ntt_offset,const listz_t add,const unsigned long len_param,const mpzspm_t ntt_context,mpmod_t modulus_param)2523 ntt_gcd (mpz_t f, mpz_t *product, mpzspv_t ntt, const unsigned long ntt_offset,
2524 	 const listz_t add, const unsigned long len_param,
2525 	 const mpzspm_t ntt_context, mpmod_t modulus_param)
2526 {
2527   unsigned long i, j;
2528   const unsigned long Rlen = MPZSPV_NORMALISE_STRIDE;
2529   listz_t R;
2530   unsigned long len = len_param, thread_offset = 0;
2531   mpres_t tmpres, tmpprod, totalprod;
2532   mpmod_t modulus;
2533   long timestart, realstart;
2534 
2535   outputf (OUTPUT_VERBOSE, "Computing gcd of coefficients and N");
2536   timestart = cputime ();
2537   realstart = realtime ();
2538 
2539   /* All the threads will multiply their partial products to this one. */
2540   mpres_init (totalprod, modulus_param);
2541   mpres_set_ui (totalprod, 1UL, modulus_param);
2542 
2543 #ifdef _OPENMP
2544 #pragma omp parallel if (len > 100) private(i, j, R, len, thread_offset, tmpres, tmpprod, modulus) shared(totalprod)
2545   {
2546     const int nr_chunks = omp_get_num_threads();
2547     const int thread_nr = omp_get_thread_num();
2548 
2549     len = (len_param - 1) / nr_chunks + 1;
2550     thread_offset = thread_nr * len;
2551     if (thread_offset <= len_param)
2552       len = MIN(len, len_param - thread_offset);
2553     else
2554       len = 0;
2555 #pragma omp master
2556     {
2557       outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
2558     }
2559 #endif
2560 
2561     /* Make a private copy of the mpmod_t struct */
2562     mpmod_init_set (modulus, modulus_param);
2563 
2564     R = init_list2 (Rlen, (mpz_size (modulus->orig_modulus) + 2) *
2565                            GMP_NUMB_BITS);
2566     mpres_init (tmpres, modulus);
2567     mpres_init (tmpprod, modulus);
2568     mpres_set_ui (tmpprod, 1UL, modulus);
2569 
2570     for (i = 0; i < len; i += Rlen)
2571       {
2572 	const unsigned long blocklen = MIN(len - i, Rlen);
2573 
2574 	/* Convert blocklen residues from NTT to integer representatives
2575 	   and store them in R */
2576 	mpzspv_to_mpzv (ntt, ntt_offset + thread_offset + i, R, blocklen,
2577 			ntt_context);
2578 
2579 	/* Accumulate product in tmpprod */
2580 	for (j = 0; j < blocklen; j++)
2581 	  {
2582 	    outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[j]);
2583 	    if (add != NULL)
2584 	      mpz_add (R[j], R[j], add[i + thread_offset + j]);
2585 	    mpres_set_z_for_gcd (tmpres, R[j], modulus);
2586 #define TEST_ZERO_RESULT
2587 #ifdef TEST_ZERO_RESULT
2588 	    if (mpres_is_zero (tmpres, modulus))
2589 	      outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
2590 #endif
2591 	    mpres_mul (tmpprod, tmpprod, tmpres, modulus);
2592 	  }
2593       }
2594 #ifdef _OPENMP
2595 #pragma omp critical
2596     {
2597       mpres_mul (totalprod, totalprod, tmpprod, modulus);
2598     }
2599 #else
2600     mpres_set (totalprod, tmpprod, modulus);
2601 #endif
2602     mpres_clear (tmpres, modulus);
2603     mpres_clear (tmpprod, modulus);
2604     mpmod_clear (modulus);
2605     clear_list (R, Rlen);
2606 #ifdef _OPENMP
2607   }
2608 #endif
2609 
2610   {
2611     mpz_t n;
2612     mpz_init (n);
2613     mpz_set_ui (n, len_param);
2614     mpres_set_z_for_gcd_fix (totalprod, totalprod, n, modulus_param);
2615     mpz_clear (n);
2616   }
2617 
2618   if (product != NULL)
2619     mpres_get_z (*product, totalprod, modulus_param);
2620 
2621   mpres_gcd (f, totalprod, modulus_param);
2622   mpres_clear (totalprod, modulus_param);
2623 
2624   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
2625 }
2626 
2627 
2628 int
pm1fs2(mpz_t f,const mpres_t X,mpmod_t modulus,const faststage2_param_t * params)2629 pm1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus,
2630 	const faststage2_param_t *params)
2631 {
2632   unsigned long phiP, nr;
2633   unsigned long i, l, lenF, lenG, lenR, tmplen;
2634   sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
2635                        progressions of prime length */
2636   set_long_t *S_2; /* This is stored as a regular set */
2637   listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
2638 		  degree s_1. It is symmetric, so has only s_1 / 2 + 1
2639 		  distinct coefficients. The sequence h_j will be stored in
2640 		  the same memory and won't be a monic polynomial, so the
2641 		  leading 1 monomial of F will be stored explicitly. Hence we
2642 		  need s_1 / 2 + 1 entries. */
2643   listz_t g, h, tmp, R;
2644   mpz_t mt;   /* All-purpose temp mpz_t */
2645   mpres_t mr; /* All-purpose temp mpres_t */
2646   int youpi = ECM_NO_FACTOR_FOUND;
2647   long timetotalstart, realtotalstart, timestart;
2648 
2649   timetotalstart = cputime ();
2650   realtotalstart = realtime ();
2651 
2652   phiP = eulerphi (params->P);
2653   ASSERT_ALWAYS (phiP == params->s_1 * params->s_2);
2654   ASSERT_ALWAYS (params->s_1 < params->l);
2655   nr = params->l - params->s_1; /* Number of points we evaluate */
2656 
2657   if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
2658       return ECM_ERROR;
2659 
2660   /* Allocate all the memory we'll need */
2661   /* Allocate the correct amount of space for each mpz_t or the
2662      reallocations will up to double the time for stage 2! */
2663   mpz_init (mt);
2664   mpres_init (mr, modulus);
2665   lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
2666 				     the leading 1 monomial for each factor */
2667   F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
2668   h = malloc ((params->s_1 + 1) * sizeof (mpz_t));
2669   if (h == NULL)
2670     {
2671       fprintf (stderr, "Cannot allocate memory in pm1fs2\n");
2672       exit (1);
2673     }
2674   lenG = params->l;
2675   g = init_list2 (lenG, (unsigned int) abs (modulus->bits));
2676   lenR = nr;
2677   R = init_list2 (lenR, (unsigned int) abs (modulus->bits));
2678   tmplen = 3UL * params->l + list_mul_mem (params->l / 2);
2679   outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
2680   if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
2681     {
2682       tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
2683       /* FIXME: It appears TMulGen_space() returns a too small value! */
2684       outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n",
2685 	       tmplen);
2686     }
2687 #ifdef SHOW_TMP_USAGE
2688   tmp = init_list (tmplen);
2689 #else
2690   tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
2691 #endif
2692 
2693   mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
2694   outputf (OUTPUT_TRACE,
2695 	   "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
2696 	   modulus->orig_modulus, mt);
2697 
2698 
2699   /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2k_1}) */
2700   outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
2701 
2702   timestart = cputime ();
2703 
2704   /* First compute X + 1/X */
2705   mpres_invert (mr, X, modulus);
2706   mpres_add (mr, mr, X, modulus);
2707 
2708   i = poly_from_sets_V (F, mr, S_1, tmp, tmplen, modulus, NULL, NULL);
2709   ASSERT_ALWAYS(2 * i == params->s_1);
2710   ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
2711   free (S_1);
2712   S_1 = NULL;
2713 
2714   outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
2715   if (test_verbose (OUTPUT_TRACE))
2716     {
2717       for (i = 0; i < params->s_1 / 2 + 1; i++)
2718 	outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
2719       outputf (OUTPUT_TRACE, "f(x) = f_0");
2720       for (i = 1; i < params->s_1 / 2 + 1; i++)
2721 	outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
2722       outputf (OUTPUT_TRACE, "/* PARI */ \n");
2723     }
2724 
2725   mpz_set_ui (mt, params->P);
2726   mpres_pow (mr, X, mt, modulus); /* mr = X^P */
2727   pm1_sequence_h (F, NULL, F, mr, params->s_1 / 2 + 1, modulus, NULL);
2728 
2729   /* Make a symmetric copy of F in h. It will have length
2730      s_1 + 1 = 2*lenF - 1 */
2731   /* I.e. with F = [3, 2, 1], s_1 = 4, we want h = [1, 2, 3, 2, 1] */
2732   for (i = 0; i < params->s_1 / 2 + 1; i++)
2733     *(h[i]) = *(F[params->s_1 / 2 - i]); /* Clone the mpz_t. */
2734   for (i = 0; i < params->s_1 / 2; i++)
2735     *(h[i + params->s_1 / 2 + 1]) = *(F[i + 1]);
2736   if (test_verbose (OUTPUT_TRACE))
2737     {
2738       for (i = 0; i < params->s_1 + 1; i++)
2739         outputf (OUTPUT_VERBOSE, "h_%lu = %Zd; /* PARI */\n", i, h[i]);
2740       outputf (OUTPUT_VERBOSE, "h(x) = h_0");
2741       for (i = 1; i < params->s_1 + 1; i++)
2742         outputf (OUTPUT_VERBOSE, " + h_%lu * x^%lu", i, i);
2743       outputf (OUTPUT_VERBOSE, " /* PARI */\n");
2744     }
2745 
2746   for (l = 0; l < params->s_2; l++)
2747     {
2748       const unsigned long M = params->l - 1L - params->s_1 / 2L;
2749       outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
2750                l + 1, params->s_2);
2751       pm1_sequence_g (g, NULL, X, params->P, M, params->l,
2752 		      params->m_1, S_2->elem[l], modulus, NULL);
2753 
2754       /* Do the convolution */
2755       /* Use the transposed "Middle Product" algorithm */
2756       /* TMulGen reverses the first input sequence, but that doesn't matter
2757 	 since h is symmetric. */
2758 
2759       outputf (OUTPUT_VERBOSE, "TMulGen of g and h");
2760       timestart = cputime ();
2761       ASSERT(tmplen >= TMulGen_space (nr - 1, params->l - 1, params->s_1));
2762 
2763       /* Computes rev(h)*g, stores coefficients of x^(s_1) to
2764 	 x^(s_1+nr-1) = x^(len-1) */
2765       if (TMulGen (R, nr - 1, h, params->s_1, g, params->l - 1, tmp,
2766 		   modulus->orig_modulus) < 0)
2767 	{
2768 	  outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
2769 		   "of memory)\n");
2770 	  youpi = ECM_ERROR;
2771 	  break;
2772 	}
2773       list_mod (R, R, nr, modulus->orig_modulus);
2774 
2775       outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
2776 
2777 #if 0 && defined(WANT_ASSERT)
2778 
2779       /* See if R[i] is correct, with a test that works even if i0 != 0 */
2780       /* More expensive self-test */
2781       /* alpha = beta*(i0 + l*nr) */
2782       /* This code is old and probably does not work. */
2783 
2784       outputf (OUTPUT_VERBOSE, "Verifying all results (slow)");
2785       for (i = 0; i < nr; i++)
2786 	{
2787 	  mpz_set_ui (mt, nr * l);
2788 	  mpz_add (mt, mt, root_params->i0);
2789 	  mpz_add_ui (mt, mt, i);
2790 	  mpz_mul_ui (mt, mt, beta);
2791 	  mpres_get_z (tmp[0], X, modulus);
2792 	  mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus);
2793 	  /* Hence, tmp[0] = X^(alpha + i * beta) */
2794 	  list_eval_poly (tmp[1], F, tmp[0], dF, 1, modulus->orig_modulus,
2795 			  tmp + 2);
2796 
2797 	  mpz_set_ui (mt, i);
2798 	  mpz_mul_ui (mt, mt, i);
2799 	  mpz_mul_ui (mt, mt, beta / 2); /* h(i) = beta*i^2/2 */
2800 	  mpres_get_z (tmp[0], X, modulus);
2801 	  mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus); /* X^h(1) */
2802 	  mpz_mul (tmp[0], tmp[0], R[i]);
2803 	  mpz_mod (tmp[0], tmp[0], modulus->orig_modulus);
2804 	  if (mpz_cmp (tmp[0], tmp[1]) != 0)
2805 	    {
2806 	      outputf (OUTPUT_ERROR, "Result in R[%ld] incorrect.\n", i);
2807 	      outputf (OUTPUT_ERROR, "R[%ld] = %Zd\n", i, R[i]);
2808 	      abort ();
2809 	    }
2810 	}
2811       outputf (OUTPUT_VERBOSE, " - everything's correct! :-D\n");
2812 #endif
2813 
2814       if (test_verbose (OUTPUT_TRACE))
2815 	{
2816 	  for (i = 0; i < nr; i++)
2817 	    outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[i]);
2818 	}
2819 
2820       outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)");
2821       timestart = cputime ();
2822 
2823       {
2824 	mpres_t tmpres, tmpprod;
2825 	mpres_init (tmpres, modulus);
2826 	mpres_init (tmpprod, modulus);
2827 	mpres_set_z_for_gcd (tmpprod, R[0], modulus);
2828 	for (i = 1; i < nr; i++)
2829 	  {
2830 	    mpres_set_z_for_gcd (tmpres, R[i], modulus);
2831 	    mpres_mul (tmpprod, tmpprod, tmpres, modulus);
2832 	  }
2833         mpres_get_z (tmp[1], tmpprod, modulus); /* For printing */
2834 	mpres_gcd (tmp[0], tmpprod, modulus);
2835 	mpres_clear (tmpprod, modulus);
2836 	mpres_clear (tmpres, modulus);
2837       }
2838 
2839       outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
2840       outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", tmp[1]);
2841 
2842       if (mpz_cmp_ui (tmp[0], 1UL) > 0)
2843 	{
2844 	  mpz_set (f, tmp[0]);
2845 	  youpi = ECM_FACTOR_FOUND_STEP2;
2846 	  break;
2847 	}
2848     }
2849 
2850 #ifdef SHOW_TMP_USAGE
2851   for (i = tmplen - 1; i > 0; i--)
2852     if (tmp[i]->_mp_alloc > 1)
2853       break;
2854   outputf (OUTPUT_DEVVERBOSE, "Highest used temp element is tmp[%lu]\n", i);
2855 #endif
2856 
2857   free (S_2);
2858   free (h);
2859   clear_list (F, lenF);
2860   clear_list (g, lenG);
2861   clear_list (R, lenR);
2862   clear_list (tmp, tmplen);
2863 
2864   mpz_clear (mt);
2865   mpres_clear (mr, modulus);
2866 
2867   outputf (OUTPUT_NORMAL, "Step 2");
2868   /* In normal output mode, print only cpu time as we always have.
2869      In verbose mode, print real time as well if we used multi-threading */
2870   if (test_verbose (OUTPUT_VERBOSE))
2871     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
2872   else
2873     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
2874 
2875   return youpi;
2876 }
2877 
2878 
2879 int
pm1fs2_ntt(mpz_t f,const mpres_t X,mpmod_t modulus,const faststage2_param_t * params)2880 pm1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus,
2881 	const faststage2_param_t *params)
2882 {
2883   unsigned long nr;
2884   unsigned long l, lenF;
2885   sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
2886                        progressions of prime length) */
2887   set_long_t *S_2; /* This is stored as a regular set */
2888   listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
2889 		  degree s_1. It is symmetric, so has only s_1 / 2 + 1
2890 		  distinct coefficients. The sequence h_j will be stored in
2891 		  the same memory and won't be a monic polynomial, so the
2892 		  leading 1 monomial of F will be stored explicitly. Hence we
2893 		  need s_1 / 2 + 1 entries. */
2894   mpzspm_t ntt_context;
2895   mpzspv_t g_ntt, h_ntt;
2896   mpz_t mt;   /* All-purpose temp mpz_t */
2897   mpz_t product; /* Product of each multi-point evaluation */
2898   mpz_t *product_ptr = NULL;
2899   mpres_t tmpres; /* All-purpose temp mpres_t */
2900   int youpi = ECM_NO_FACTOR_FOUND;
2901   long timetotalstart, realtotalstart, timestart, realstart;
2902 
2903   timetotalstart = cputime ();
2904   realtotalstart = realtime ();
2905 
2906   ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
2907   ASSERT_ALWAYS (params->s_1 < params->l);
2908   nr = params->l - params->s_1; /* Number of points we evaluate */
2909 
2910   /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution
2911      with g. We need NTT of transform length l. We do it here at the start
2912      of stage 2 so that in case of a "not enough primes" condition,
2913      we don't have to wait until after F is built to get the error. */
2914 
2915   ntt_context = mpzspm_init (params->l, modulus->orig_modulus);
2916   if (ntt_context == NULL)
2917     {
2918       outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
2919                "presumably out of memory\n");
2920       return ECM_ERROR;
2921     }
2922 
2923   print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ",
2924 		    ntt_context);
2925 
2926   if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
2927       return ECM_ERROR;
2928 
2929   /* Allocate all the memory we'll need for building f */
2930   mpz_init (mt);
2931   mpres_init (tmpres, modulus);
2932   lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
2933 				     the leading 1 monomial for each factor */
2934   {
2935     /* in some cases the above value of lenF is not enough, for example with
2936        s_1 = 10, which gives lenF = 7, but 9 entries are needed */
2937     unsigned long mem = mem_poly_from_sets_V (S_1);
2938     if (mem > lenF)
2939       lenF = mem;
2940   }
2941 
2942   F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
2943 
2944   mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
2945   outputf (OUTPUT_TRACE,
2946 	   "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
2947 	   modulus->orig_modulus, mt);
2948 
2949 #if 0 && defined (WANT_ASSERT)
2950   /* For this self test run with a large enough B2 so that enough memory
2951      is allocated for tmp and F_ntt, otherwise it segfaults. */
2952   {
2953     int testlen = 255;
2954     int i, j;
2955     /* A test of ntt_sqr_reciprocal() */
2956     for (j = 1; j <= testlen; j++)
2957       {
2958         outputf (OUTPUT_VERBOSE,
2959                  "Testing ntt_sqr_reciprocal() for input degree %d\n",
2960                  j - 1);
2961         for (i = 0; i < j; i++)
2962           mpz_set_ui (tmp[i], 1UL);
2963         ntt_sqr_reciprocal (tmp, tmp, F_ntt, (spv_size_t) j, ntt_context_F);
2964         for (i = 0; i < 2 * j - 1; i++)
2965           {
2966             ASSERT (mpz_cmp_ui (tmp[i], 2 * j - 1 - i) == 0);
2967           }
2968       }
2969     outputf (OUTPUT_VERBOSE,
2970              "Test of ntt_sqr_reciprocal() for input degree 2 ... %d passed\n",
2971              testlen - 1);
2972   }
2973 #endif
2974 
2975 
2976   /* First compute X + 1/X */
2977   mpres_invert (tmpres, X, modulus);
2978   mpres_add (tmpres, tmpres, X, modulus);
2979 
2980   if (build_F_ntt (F, tmpres, S_1, params, modulus) == ECM_ERROR)
2981     {
2982       free (S_1);
2983       free (S_2);
2984       mpz_clear (mt);
2985       mpres_clear (tmpres, modulus);
2986       mpzspm_clear (ntt_context);
2987       clear_list (F, lenF);
2988       return ECM_ERROR;
2989     }
2990 
2991   free (S_1);
2992   S_1 = NULL;
2993 
2994   h_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
2995 
2996   mpz_set_ui (mt, params->P);
2997   mpres_pow (tmpres, X, mt, modulus); /* tmpres = X^P */
2998   pm1_sequence_h (NULL, h_ntt, F, tmpres, params->s_1 / 2 + 1, modulus,
2999 		  ntt_context);
3000 
3001   clear_list (F, lenF);
3002   g_ntt = mpzspv_init (params->l, ntt_context);
3003 
3004   /* Compute the DCT-I of h */
3005   outputf (OUTPUT_VERBOSE, "Computing DCT-I of h");
3006 #ifdef _OPENMP
3007   outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
3008 #endif
3009   timestart = cputime ();
3010   realstart = realtime ();
3011 
3012   mpzspv_to_dct1 (h_ntt, h_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
3013                   g_ntt, ntt_context);
3014   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3015 
3016   if (test_verbose (OUTPUT_RESVERBOSE))
3017     {
3018       mpz_init (product);
3019       product_ptr = &product;
3020     }
3021 
3022   for (l = 0; l < params->s_2; l++)
3023     {
3024       const unsigned long M = params->l - 1L - params->s_1 / 2L;
3025 
3026       outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
3027                l + 1, params->s_2);
3028       /* Compute the coefficients of the polynomial g(x) */
3029       pm1_sequence_g (NULL, g_ntt, X, params->P, M, params->l,
3030 		      params->m_1, S_2->elem[l], modulus, ntt_context);
3031 
3032       /* Do the convolution */
3033       outputf (OUTPUT_VERBOSE, "Computing g*h");
3034 #ifdef _OPENMP
3035       outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
3036 #endif
3037       timestart = cputime ();
3038       realstart = realtime ();
3039       mpzspv_mul_by_dct (g_ntt, h_ntt, params->l, ntt_context,
3040         NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
3041       print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3042 
3043       /* Compute GCD of N and coefficients of product polynomial */
3044       ntt_gcd (mt, product_ptr, g_ntt, params->s_1 / 2, NULL, nr, ntt_context,
3045 	       modulus);
3046 
3047       outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);
3048 
3049       /* If we found a factor, stop */
3050       if (mpz_cmp_ui (mt, 1UL) > 0)
3051 	{
3052 	  mpz_set (f, mt);
3053 	  youpi = ECM_FACTOR_FOUND_STEP2;
3054 	  break;
3055 	}
3056     }
3057 
3058   if (test_verbose (OUTPUT_RESVERBOSE))
3059     {
3060       product_ptr = NULL;
3061       mpz_clear (product);
3062     }
3063   mpzspv_clear (g_ntt, ntt_context);
3064   mpzspv_clear (h_ntt, ntt_context);
3065   mpzspm_clear (ntt_context);
3066   mpres_clear (tmpres, modulus);
3067   mpz_clear (mt);
3068   free (S_2);
3069 
3070   outputf (OUTPUT_NORMAL, "Step 2");
3071   /* In normal output mode, print only cpu time as we always have.
3072      In verbose mode, print real time as well if we used multi-threading */
3073   if (test_verbose (OUTPUT_VERBOSE))
3074     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
3075   else
3076     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
3077 
3078   return youpi;
3079 }
3080 
3081 
3082 static void
gfp_ext_print(const mpres_t r_x,const mpres_t r_y,mpmod_t modulus,const int verbose)3083 gfp_ext_print (const mpres_t r_x, const mpres_t r_y, mpmod_t modulus,
3084 	       const int verbose)
3085 {
3086   mpz_t t1, t2;
3087 
3088   if (!test_verbose (verbose))
3089     return;
3090 
3091   mpz_init (t1);
3092   mpz_init (t2);
3093   mpres_get_z (t1, r_x, modulus);
3094   mpres_get_z (t2, r_y, modulus);
3095   outputf (verbose, "Mod(%Zd, N) + Mod(%Zd, N) * w", t1, t2);
3096 
3097   mpz_clear (t1);
3098   mpz_clear (t2);
3099 }
3100 
3101 
3102 
3103 /* Multiplies (a_0 + a_1*sqrt(Delta)) * (b_0 + b_1*sqrt(Delta))
3104    using four multiplications. Result goes in (r_0 + r_1*sqrt(Delta)).
3105    a_0, b_0, r_0 as well as a_1, b_1, r_1 may overlap arbitrarily. t[0], t[1],
3106    t[2] and Delta must not overlap with anything. */
3107 /* FIXME: is there a faster multiplication routine if both inputs have
3108    norm 1? */
3109 
3110 static void
gfp_ext_mul(mpres_t r_0,mpres_t r_1,const mpres_t a_0,const mpres_t a_1,const mpres_t b_0,const mpres_t b_1,const mpres_t Delta,mpmod_t modulus,ATTRIBUTE_UNUSED const unsigned long tmplen,mpres_t * tmp)3111 gfp_ext_mul (mpres_t r_0, mpres_t r_1, const mpres_t a_0, const mpres_t a_1,
3112 	     const mpres_t b_0, const mpres_t b_1, const mpres_t Delta,
3113 	     mpmod_t modulus, ATTRIBUTE_UNUSED const unsigned long tmplen,
3114 	     mpres_t *tmp)
3115 {
3116   ASSERT (tmplen >= 2);
3117   if (0 && test_verbose (OUTPUT_TRACE))
3118     {
3119       mpz_t t;
3120       mpz_init (t);
3121       mpres_get_z (t, Delta, modulus);
3122       outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ w = quadgen (4*%Zd); "
3123                "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
3124       mpz_clear (t);
3125       outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ (");
3126       gfp_ext_print (a_0, a_1, modulus, OUTPUT_TRACE);
3127       outputf (OUTPUT_TRACE, ") * (");
3128       gfp_ext_print (b_0, b_1, modulus, OUTPUT_TRACE);
3129     }
3130 
3131   mpres_add (tmp[0], a_0, a_1, modulus);
3132   mpres_add (tmp[1], b_0, b_1, modulus);
3133   mpres_mul (tmp[1], tmp[0], tmp[1], modulus); /* t[1] = (a_0+a_1)*(b_0+b_1) =
3134 					    a_0*b_0 + a_0*b_1 + a_1*b_0 +
3135 					    a_1*b_1 */
3136 
3137   mpres_mul (r_0, a_0, b_0, modulus);    /* r_0 = a_0*b_0. We don't need a_0
3138 					    or b_0 any more now */
3139   mpres_sub (tmp[1], tmp[1], r_0, modulus);  /* t[1] = a_0*b_1 + a_1*b_0 +
3140 						a_1*b_1 */
3141 
3142   mpres_mul (tmp[0], a_1, b_1, modulus);   /* t[0] = a_1*b_1. We don't need
3143 					      a_1 or b_1 any more now */
3144   mpres_sub (r_1, tmp[1], tmp[0], modulus);  /* r_1 == a_0*b_1 + a_1*b_0 */
3145 
3146   mpres_mul (tmp[0], tmp[0], Delta, modulus); /* t[0] = a_1*b_1*Delta */
3147   mpres_add (r_0, r_0, tmp[0], modulus);   /* r_0 = a_0*b_0 + a_1*b_1*Delta */
3148 
3149   if (0 && test_verbose (OUTPUT_TRACE))
3150     {
3151       outputf (OUTPUT_TRACE, ") == ");
3152       gfp_ext_print (r_0, r_1, modulus, OUTPUT_TRACE);
3153       outputf (OUTPUT_TRACE, " /* PARI C */\n");
3154     }
3155 }
3156 
3157 
3158 /* Computes (a_0 + a_1 * sqrt(Delta))^2, where the norm
3159    (a_0^2 - a_1^2*Delta) is assumed to be equal to 1. Hence
3160    (a_0 + a_1 * sqrt(Delta))^2 = a_0^2 + 2*a_0*a_1*sqrt(Delta) + a_1^2*Delta
3161    and a_0^2 + a_1^2*Delta = a_0^2 + a_1^2*Delta + norm - 1 = 2*a_0^2 - 1.
3162    a_0 and r_0, as well as a_1 and r_1 may overlap */
3163 
3164 static void
gfp_ext_sqr_norm1(mpres_t r_0,mpres_t r_1,const mpres_t a_0,const mpres_t a_1,mpmod_t modulus)3165 gfp_ext_sqr_norm1 (mpres_t r_0, mpres_t r_1, const mpres_t a_0,
3166 		   const mpres_t a_1, mpmod_t modulus)
3167 {
3168   ASSERT (a_0 != r_1);  /* a_0 is read after r_1 is written */
3169 
3170   if (pari)
3171     gmp_printf ("/* gfp_ext_sqr_norm1 */ (%Zd + %Zd * w)^2 %% N == ", a_0, a_1);
3172 
3173   mpres_mul (r_1, a_0, a_1, modulus);
3174   mpres_add (r_1, r_1, r_1, modulus);       /* r_1 = 2*a_0*a_1 */
3175 
3176   mpres_sqr (r_0, a_0, modulus);
3177   mpres_add (r_0, r_0, r_0, modulus);
3178   mpres_sub_ui (r_0, r_0, 1UL, modulus);    /* r_0 = 2*a_0^2 - 1 */
3179 
3180   if (pari)
3181     gmp_printf ("(%Zd + %Zd * w) %% N /* PARI C */\n", r_0, r_1);
3182 }
3183 
3184 
3185 /* Raise (a0 + a1*sqrt(Delta)) to the power e which is a signed long int.
3186    (a0 + a1*sqrt(Delta)) is assumed to have norm 1, i.e.
3187    a0^2 - a1^2*Delta == 1. The result is (r0 * r1*sqrt(Delta)).
3188    a0, a1, r0 and r1 must not overlap */
3189 
3190 static void
gfp_ext_pow_norm1_sl(mpres_t r0,mpres_t r1,const mpres_t a0,const mpres_t a1,const long e,const mpres_t Delta,mpmod_t modulus,unsigned long tmplen,mpres_t * tmp)3191 gfp_ext_pow_norm1_sl (mpres_t r0, mpres_t r1, const mpres_t a0,
3192                       const mpres_t a1, const long e, const mpres_t Delta,
3193                       mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
3194 {
3195   const unsigned long abs_e = labs (e);
3196   unsigned long mask = ~0UL - (~0UL >> 1);
3197 
3198   ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);
3199 
3200   if (e == 0)
3201     {
3202       mpres_set_ui (r0, 1UL, modulus);
3203       mpres_set_ui (r1, 0UL, modulus);
3204       return;
3205     }
3206 
3207   /* If e < 0, we want 1/(a0 + a1*sqrt(Delta)). By extending with
3208      a0 - a1*sqrt(Delta), we get
3209      (a0 - a1*sqrt(Delta)) / (a0^2 - a1^2 * Delta), but that denomiator
3210      is the norm which is known to be 1, so the result is
3211      a0 - a1*sqrt(Delta). */
3212 
3213   while ((abs_e & mask) == 0UL)
3214     mask >>= 1;
3215 
3216   mpres_set (r0, a0, modulus);
3217   mpres_set (r1, a1, modulus);
3218 
3219   while (mask > 1UL)
3220     {
3221       gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
3222       mask >>= 1;
3223       if (abs_e & mask)
3224 	gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
3225     }
3226 
3227   if (e < 0)
3228     mpres_neg (r1, r1, modulus);
3229 
3230   if (0 && test_verbose (OUTPUT_TRACE))
3231     {
3232       mpz_t t;
3233       mpz_init (t);
3234       mpres_get_z (t, Delta, modulus);
3235       outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ w = quadgen (4*%Zd); "
3236                "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
3237       mpz_clear (t);
3238       outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ (");
3239       gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
3240       outputf (OUTPUT_TRACE, ")^(%ld) == ", e);
3241       gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
3242       outputf (OUTPUT_TRACE, " /* PARI C */\n");
3243     }
3244 }
3245 
3246 
3247 /* Same, but taking an mpz_t argument for the exponent */
3248 
3249 static void
gfp_ext_pow_norm1(mpres_t r0,mpres_t r1,const mpres_t a0,const mpres_t a1,mpz_t e,const mpres_t Delta,mpmod_t modulus,unsigned long tmplen,mpres_t * tmp)3250 gfp_ext_pow_norm1 (mpres_t r0, mpres_t r1, const mpres_t a0,
3251                    const mpres_t a1, mpz_t e, const mpres_t Delta,
3252                    mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
3253 {
3254   mpz_t abs_e;
3255   unsigned long idx;
3256 
3257   ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);
3258 
3259   if (mpz_sgn (e) == 0)
3260     {
3261       mpres_set_ui (r0, 1UL, modulus);
3262       mpres_set_ui (r1, 0UL, modulus);
3263       return;
3264     }
3265 
3266   mpz_init (abs_e);
3267   mpz_abs (abs_e, e);
3268   idx = mpz_sizeinbase (abs_e, 2) - 1; /* Thus mpz_tstbit (abs_e, idx) == 1 */
3269   ASSERT (mpz_tstbit (abs_e, idx) == 1);
3270 
3271   mpres_set (r0, a0, modulus);
3272   mpres_set (r1, a1, modulus);
3273 
3274   while (idx > 0UL)
3275     {
3276       gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
3277       idx--;
3278       if (mpz_tstbit (abs_e, idx))
3279 	gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
3280     }
3281 
3282   if (mpz_sgn (e) < 0)
3283     mpres_neg (r1, r1, modulus);
3284 
3285   mpz_clear (abs_e);
3286 
3287   if (test_verbose (OUTPUT_TRACE))
3288     {
3289       mpz_t t;
3290       mpz_init (t);
3291       mpres_get_z (t, Delta, modulus);
3292       outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ w = quadgen (4*%Zd); "
3293                "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
3294       mpz_clear (t);
3295       outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ (");
3296       gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
3297       outputf (OUTPUT_TRACE, ")^(%Zd) == ", e);
3298       gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
3299       outputf (OUTPUT_TRACE, " /* PARI C */\n");
3300     }
3301 }
3302 
3303 #if 0
3304 /* Compute r[i] = a^((k+i)^2) for i = 0, 1, ..., l-1, where "a" is an
3305    element of norm 1 in the quadratic extension ring */
3306 
3307 static void
3308 gfp_ext_rn2 (mpres_t *r_x, mpres_t *r_y, const mpres_t a_x, const mpres_t a_y,
3309 	     const long k, const unsigned long l, const mpres_t Delta,
3310 	     mpmod_t modulus, const unsigned long origtmplen, mpres_t *origtmp)
3311 {
3312   mpres_t *r2_x = origtmp, *r2_y = origtmp + 2, *v = origtmp + 4,
3313     *V2 = origtmp + 6;
3314   const unsigned long newtmplen = origtmplen - 7;
3315   mpres_t *newtmp = origtmp + 7;
3316   unsigned long i;
3317 
3318   if (l == 0UL)
3319     return;
3320 
3321   ASSERT (origtmplen >= 8UL);
3322 
3323   if (pari)
3324     gmp_printf ("/* In gfp_ext_rn2 */ ; a = %Zd + %Zd * w; /* PARI */\n",
3325 		a_x, a_y, modulus->orig_modulus);
3326 
3327   /* Compute r[0] = a^(k^2). We do it by two exponentiations by k and use
3328      v[0] and v[1] as temp storage */
3329   gfp_ext_pow_norm1_sl (v[0], v[1], a_x, a_y, k, Delta, modulus, newtmplen,
3330 		     newtmp);
3331   gfp_ext_pow_norm1_sl (r_x[0], r_y[0], v[0], v[1], k, Delta, modulus,
3332 		     newtmplen, newtmp);
3333   if (pari)
3334     gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
3335 		"/* PARI C */\n", k, r_x[0], r_y[0]);
3336 
3337   /* Compute r[1] = a^((k+1)^2) = a^(k^2 + 2k + 1)*/
3338   if (l > 1)
3339     {
3340       /* v[0] + v[1]*sqrt(Delta) still contains a^k */
3341       gfp_ext_sqr_norm1 (r_x[1], r_y[1], v[0], v[1], modulus);
3342       /* Now r[1] = a^(2k) */
3343       gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], r_x[0], r_y[0], Delta,
3344 		   modulus, newtmplen, newtmp);
3345       /* Now r[1] = a^(k^2 + 2k) */
3346       gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], a_x, a_y, Delta, modulus,
3347 		   newtmplen, newtmp);
3348       /* Now r[1] = a^(k^2 + 2k + 1) = a^((k+1)^2) */
3349     }
3350   if (pari)
3351     gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
3352 		"/* PARI C */\n", k + 1, r_x[1], r_y[1]);
3353 
3354   /* Compute r2[0] = a^(k^2+2) = a^(k^2) * a^2 */
3355   gfp_ext_sqr_norm1 (v[0], v[1], a_x, a_y, modulus);
3356   gfp_ext_mul (r2_x[0], r2_y[0], r_x[0], r_y[0], v[0], v[1], Delta, modulus,
3357 	       newtmplen, newtmp);
3358   if (pari)
3359     gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
3360 		"/* PARI C */\n", k, r2_x[0], r2_y[0]);
3361   /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
3362   gfp_ext_mul (r2_x[1], r2_y[1], r_x[1], r_y[1], v[0], v[1], Delta, modulus,
3363 	       newtmplen, newtmp);
3364   if (pari)
3365     gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
3366 		"/* PARI C */\n", k + 1, r2_x[1], r2_y[1]);
3367 
3368   /* Compute V_2(a + 1/a). Since 1/a = a_x - a_y, we have a+1/a = 2*a_x.
3369      V_2(x) = x^2 - 2, so we want 4*a_x^2 - 2. */
3370   mpres_add (*V2, a_x, a_x, modulus); /* V2 = a + 1/a  = 2*a_x*/
3371   V (v[0], *V2, 2 * k + 1, modulus);  /* v[0] = V_{2k+1} (a + 1/a) */
3372   V (v[1], *V2, 2 * k + 3, modulus);  /* v[0] = V_{2k+3} (a + 1/a) */
3373   mpres_sqr (*V2, *V2, modulus);      /* V2 = 4*a_x^2 */
3374   mpres_sub_ui (*V2, *V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
3375   if (pari)
3376     {
3377       gmp_printf ("/* In gfp_ext_rn2 */ ((a + 1/a)^2 - 2) %% N == "
3378 		  "%Zd %% N /* PARI C */\n", *V2);
3379       gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
3380 		  "/* PARI C */\n", 2 * k + 1, v[0]);
3381       gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
3382 		  "/* PARI C */\n", 2 * k + 3, v[1]);
3383     }
3384 
3385   /* Compute the remaining a^((k+i)^2) values according to Peter's
3386      recurrence */
3387 
3388   for (i = 2; i < l; i++)
3389     {
3390       /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i taken
3391 	 modulo 2 */
3392       mpres_mul (r_x[i], r2_x[1 - i % 2], v[i % 2], modulus);
3393       mpres_sub (r_x[i], r_x[i], r2_x[i % 2], modulus);
3394       mpres_mul (r_y[i], r2_y[1 - i % 2], v[i % 2], modulus);
3395       mpres_sub (r_y[i], r_y[i], r2_y[i % 2], modulus);
3396 
3397       /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
3398       mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
3399       mpres_sub (r2_x[i % 2], r2_x[i % 2], r_x[i - 2], modulus);
3400       mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
3401       mpres_sub (r2_y[i % 2], r2_y[i % 2], r_y[i - 2], modulus);
3402 
3403       /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
3404       mpres_mul (newtmp[0], v[1 - i % 2], *V2, modulus);
3405       mpres_sub (v[i % 2], newtmp[0], v[i % 2], modulus);
3406       if (pari)
3407 	gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
3408 		    "/* PARI C */\n", 2 * (k + i) + 1, v[i % 2]);
3409     }
3410 }
3411 #endif
3412 
3413 /* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l.
3414    x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P. */
3415 
3416 static void
pp1_sequence_g(listz_t g_x,listz_t g_y,mpzspv_t g_x_ntt,mpzspv_t g_y_ntt,const mpres_t b1_x,const mpres_t b1_y,const unsigned long P,const mpres_t Delta,const long M_param,const unsigned long l_param,const mpz_t m_1,const long k_2,const mpmod_t modulus_param,const mpzspm_t ntt_context)3417 pp1_sequence_g (listz_t g_x, listz_t g_y, mpzspv_t g_x_ntt, mpzspv_t g_y_ntt,
3418 		const mpres_t b1_x, const mpres_t b1_y, const unsigned long P,
3419 		const mpres_t Delta, const long M_param,
3420 		const unsigned long l_param, const mpz_t m_1, const long k_2,
3421 		const mpmod_t modulus_param, const mpzspm_t ntt_context)
3422 {
3423   const unsigned long tmplen = 3;
3424   const int want_x = (g_x != NULL || g_x_ntt != NULL);
3425   const int want_y = (g_y != NULL || g_y_ntt != NULL);
3426   mpres_t r_x, r_y, x0_x, x0_y, v2,
3427       r1_x[2], r1_y[2], r2_x[2], r2_y[2],
3428       v[2], tmp[3];
3429   mpz_t mt;
3430   mpmod_t modulus; /* Thread-local copy of modulus_param */
3431   unsigned long i, l = l_param, offset = 0;
3432   long M = M_param;
3433   long timestart, realstart;
3434   int want_output = 1;
3435 
3436   outputf (OUTPUT_VERBOSE, "Computing %s%s%s",
3437 	   (want_x) ? "g_x" : "",
3438 	   (want_x && want_y) ? " and " : "",
3439 	   (want_y) ? "g_y" : "");
3440   timestart = cputime ();
3441   realstart = realtime ();
3442 
3443 #ifdef _OPENMP
3444 #pragma omp parallel if (l > 100) private(r_x, r_y, x0_x, x0_y, v2, r1_x, r1_y, r2_x, r2_y, v, tmp, mt, modulus, i, l, offset, M, want_output)
3445   {
3446     /* When multi-threading, we adjust the parameters for each thread */
3447 
3448     const int nr_chunks = omp_get_num_threads();
3449     const int thread_nr = omp_get_thread_num();
3450 
3451     l = (l_param - 1) / nr_chunks + 1;
3452     offset = thread_nr * l;
3453     if (offset <= l_param)
3454       l = MIN(l, l_param - offset);
3455     else
3456       l = 0;
3457     M = M_param - (long) offset;
3458 
3459     want_output = (omp_get_thread_num() == 0);
3460     if (want_output)
3461       outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
3462 #endif
3463     mpmod_init_set (modulus, modulus_param);
3464     mpres_init (r_x, modulus);
3465     mpres_init (r_y, modulus);
3466     mpres_init (x0_x, modulus);
3467     mpres_init (x0_y, modulus);
3468     mpres_init (v2, modulus);
3469     for (i = 0; i < 2UL; i++)
3470       {
3471 	mpres_init (r1_x[i], modulus);
3472 	mpres_init (r1_y[i], modulus);
3473 	mpres_init (r2_x[i], modulus);
3474 	mpres_init (r2_y[i], modulus);
3475 	mpres_init (v[i], modulus);
3476       }
3477     for (i = 0; i < tmplen; i++)
3478       mpres_init (tmp[i], modulus);
3479     mpz_init (mt);
3480 
3481     if (want_output && test_verbose (OUTPUT_TRACE))
3482       {
3483 	mpres_get_z (mt, Delta, modulus);
3484 	outputf (OUTPUT_TRACE,
3485 		 "\n/* pp1_sequence_g */ w = quadgen (4*%Zd); P = %lu; "
3486 		 "M = %ld; k_2 = %ld; m_1 = %Zd; N = %Zd; /* PARI */\n",
3487 		 mt, P, M, k_2, m_1, modulus->orig_modulus);
3488 
3489 	outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ b_1 = ");
3490 	gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
3491 	outputf (OUTPUT_TRACE, "; /* PARI */\n");
3492 	outputf (OUTPUT_TRACE,
3493 		 "/* pp1_sequence_g */ r = b_1^P; /* PARI */\n");
3494 	outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
3495 		 "x_0 = b_1^(2*k_2 + (2*m_1 + 1) * P); /* PARI */\n");
3496 	outputf (OUTPUT_TRACE,
3497 		 "/* pp1_sequence_g */ addrec(x) = x + 1/x; /* PARI */\n");
3498       }
3499 
3500     /* Compute r */
3501     gfp_ext_pow_norm1_sl (r_x, r_y, b1_x, b1_y, P, Delta, modulus,
3502 			  tmplen, tmp);
3503     if (want_output && test_verbose (OUTPUT_TRACE))
3504       {
3505 	outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ r == ");
3506 	gfp_ext_print (r_x, r_y, modulus, OUTPUT_TRACE);
3507 	outputf (OUTPUT_TRACE, " /* PARI C */\n");
3508       }
3509 
3510     /* Compute x0 = x_0 */
3511     mpz_mul_2exp (mt, m_1, 1UL);
3512     mpz_add_ui (mt, mt, 1UL);
3513     mpz_mul_ui (mt, mt, P);
3514     mpz_add_si (mt, mt, k_2);
3515     mpz_add_si (mt, mt, k_2); /* mt = 2*k_2 + (2*m_1 + 1) * P */
3516     gfp_ext_pow_norm1 (x0_x, x0_y, b1_x, b1_y, mt, Delta, modulus,
3517 		       tmplen, tmp);
3518     if (want_output && test_verbose (OUTPUT_TRACE))
3519       {
3520 	outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ x_0 == ");
3521 	gfp_ext_print (x0_x, x0_y, modulus, OUTPUT_TRACE);
3522 	outputf (OUTPUT_TRACE, " /* PARI C */\n");
3523       }
3524 
3525 
3526     /* Compute g[1] = r1[0] = x0^M * r^(M^2) = (x0 * r^M)^M.
3527        We use v[0,1] as temporary storage */
3528     gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M, Delta, modulus,
3529 			  tmplen, tmp); /* v[0,1] = r^M */
3530     gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
3531 		 tmplen, tmp); /* v[0,1] = r^M * x_0 */
3532     gfp_ext_pow_norm1_sl (r1_x[0], r1_y[0], v[0], v[1], M, Delta, modulus,
3533 			  tmplen, tmp); /* r1[0] = (r^M * x_0)^M */
3534     if (l > 0)
3535       {
3536         if (g_x != NULL)
3537           mpres_get_z (g_x[offset], r1_x[0], modulus);
3538         if (g_y != NULL)
3539           mpres_get_z (g_y[offset], r1_y[0], modulus);
3540         if (g_x_ntt != NULL)
3541           {
3542             mpres_get_z (mt, r1_x[0], modulus);
3543             mpzspv_from_mpzv (g_x_ntt, offset, &mt, 1UL, ntt_context);
3544           }
3545         if (g_y_ntt != NULL)
3546           {
3547             mpres_get_z (mt, r1_y[0], modulus);
3548             mpzspv_from_mpzv (g_y_ntt, offset, &mt, 1UL, ntt_context);
3549           }
3550       }
3551 
3552 
3553     /* Compute g[1] = r1[1] = x0^(M-1) * r^((M-1)^2) = (x0 * r^(M-1))^(M-1).
3554        We use v[0,1] as temporary storage. FIXME: simplify, reusing g_0 */
3555     gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M - 1, Delta, modulus,
3556 			  tmplen, tmp);
3557     gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
3558 		 tmplen, tmp);
3559     gfp_ext_pow_norm1_sl (r1_x[1], r1_y[1], v[0], v[1], M - 1, Delta,
3560 			  modulus, tmplen, tmp);
3561     if (l > 1)
3562       {
3563         if (g_x != NULL)
3564           mpres_get_z (g_x[offset + 1], r1_x[1], modulus);
3565         if (g_y != NULL)
3566           mpres_get_z (g_y[offset + 1], r1_y[1], modulus);
3567         if (g_x_ntt != NULL)
3568           {
3569             mpres_get_z (mt, r1_x[1], modulus);
3570             mpzspv_from_mpzv (g_x_ntt, offset + 1, &mt, 1UL, ntt_context);
3571           }
3572         if (g_y_ntt != NULL)
3573           {
3574             mpres_get_z (mt, r1_y[1], modulus);
3575             mpzspv_from_mpzv (g_y_ntt, offset + 1, &mt, 1UL, ntt_context);
3576           }
3577       }
3578 
3579 
3580     /* x0 := $x_0 * r^{2M - 3}$ */
3581     /* We don't need x0 after this so we overwrite it. We use v[0,1] as
3582        temp storage for $r^{2M - 3}$. */
3583     gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, 2UL*M - 3UL, Delta, modulus,
3584 			  tmplen, tmp);
3585     gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, v[0], v[1], Delta, modulus,
3586 		 tmplen, tmp);
3587 
3588     /* Compute r2[0] = r1[0] * r^2 and r2[1] = r1[1] * r^2. */
3589     /* We only need $r^2$ from here on, so we set r = $r^2$ */
3590     gfp_ext_sqr_norm1 (r_x, r_y, r_x, r_y, modulus);
3591     gfp_ext_mul (r2_x[0], r2_y[0], r1_x[0], r1_y[0], r_x, r_y, Delta,
3592 		 modulus, tmplen, tmp);
3593     gfp_ext_mul (r2_x[1], r2_y[1], r1_x[1], r1_y[1], r_x, r_y, Delta,
3594 		 modulus, tmplen, tmp);
3595 
3596     /* v[1] := $x_0 * r^{2*M - 3} + 1/(x_0 * r^{2M - 3}) */
3597     mpres_add (v[1], x0_x, x0_x, modulus);
3598     /* x0 := x0 * r = $x_0 * r^{2M - 1}$ */
3599     gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, r_x, r_y, Delta, modulus,
3600 		 tmplen, tmp);
3601     /* v[0] := $x_0 * r^{2M - 1} + 1/(x_0 * r^{2M - 1}) */
3602     mpres_add (v[0], x0_x, x0_x, modulus);
3603 
3604     /* v2 = V_2 (r + 1/r) = r^2 + 1/r^2 */
3605     mpres_add (v2, r_x, r_x, modulus);
3606 
3607     /* We don't need the contents of r any more and use it as a temp var */
3608 
3609     for (i = 2; i < l; i++)
3610       {
3611 	if (want_x)
3612 	  {
3613 	    /* r1[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
3614 	       taken modulo 2. We store the new r1_x[i] in r_x for now */
3615 	    mpres_mul (r_x, r2_x[1 - i % 2], v[i % 2], modulus);
3616 	    mpres_sub (r_x, r_x,            r2_x[i % 2], modulus);
3617 	    /* r2[i] = r2[i-1] * v[i-1] - r1[i-2] */
3618 	    mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
3619 	    mpres_sub (r2_x[i % 2], r2_x[i % 2],     r1_x[i % 2], modulus);
3620 	    mpres_set (r1_x[i % 2], r_x, modulus); /* FIXME, avoid this copy */
3621 	    if (g_x != NULL)
3622 	      mpres_get_z (g_x[offset + i], r_x, modulus); /* FIXME, avoid these REDC */
3623 	    if (g_x_ntt != NULL)
3624 	      {
3625 		mpres_get_z (mt, r_x, modulus);
3626 		mpzspv_from_mpzv (g_x_ntt, offset + i, &mt, 1UL, ntt_context);
3627 	      }
3628 	  }
3629 
3630 	if (want_y)
3631 	  {
3632 	    /* Same for y coordinate */
3633 	    mpres_mul (r_y, r2_y[1 - i % 2], v[i % 2], modulus);
3634 	    mpres_sub (r_y, r_y,             r2_y[i % 2], modulus);
3635 	    mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
3636 	    mpres_sub (r2_y[i % 2], r2_y[i % 2],     r1_y[i % 2], modulus);
3637 	    mpres_set (r1_y[i % 2], r_y, modulus);
3638 	    if (g_y != NULL)
3639 	      mpres_get_z (g_y[offset + i], r_y, modulus); /* Keep r1, r2 in mpz_t ? */
3640 	    if (g_y_ntt != NULL)
3641 	      {
3642 		mpres_get_z (mt, r_y, modulus);
3643 		mpzspv_from_mpzv (g_y_ntt, offset + i, &mt, 1UL, ntt_context);
3644 	      }
3645 	  }
3646 
3647 	/* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
3648 	mpres_mul (r_x, v[1 - i % 2], v2, modulus);
3649 	mpres_sub (v[i % 2], r_x, v[i % 2], modulus);
3650 	if (want_output && test_verbose (OUTPUT_TRACE))
3651 	  {
3652 	    mpz_t t;
3653 	    mpz_init (t);
3654 	    mpres_get_z (t, v[i % 2], modulus);
3655 	    outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
3656 		     "addrec(x_0 * r^(2*(M-%lu) - 1)) == %Zd /* PARI C */\n",
3657 		     i, t);
3658 	    mpz_clear (t);
3659 	  }
3660       }
3661 
3662     mpres_clear (r_x, modulus);
3663     mpres_clear (r_y, modulus);
3664     mpres_clear (x0_x, modulus);
3665     mpres_clear (x0_y, modulus);
3666     mpres_clear (v2, modulus);
3667     for (i = 0; i < 2; i++)
3668       {
3669 	mpres_clear (r1_x[i], modulus);
3670 	mpres_clear (r1_y[i], modulus);
3671 	mpres_clear (r2_x[i], modulus);
3672 	mpres_clear (r2_y[i], modulus);
3673 	mpres_clear (v[i], modulus);
3674       }
3675     for (i = 0; i < tmplen; i++)
3676       mpres_clear (tmp[i], modulus);
3677     mpz_clear (mt);
3678     mpmod_clear (modulus);
3679 #ifdef _OPENMP
3680   }
3681 #endif
3682 
3683   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3684 
3685   if (g_x != NULL && g_y != NULL && test_verbose(OUTPUT_TRACE))
3686     {
3687       for (i = 0; i < l_param; i++)
3688 	{
3689 	  outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu = "
3690 		   "x_0^(M-%lu) * r^((M-%lu)^2); /* PARI */", i, i, i);
3691 	  outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu == "
3692 		   "%Zd + %Zd*w /* PARI C */\n", i, g_x[i], g_y[i]);
3693 	}
3694     }
3695 }
3696 
3697 
3698 /* Compute r[i] = b1^(-P*(k+i)^2) * f_i for i = 0, 1, ..., l-1, where "b1" is
3699    an element of norm 1 in the quadratic extension ring */
3700 
3701 static void
pp1_sequence_h(listz_t h_x,listz_t h_y,mpzspv_t h_x_ntt,mpzspv_t h_y_ntt,const listz_t f,const mpres_t b1_x,const mpres_t b1_y,const long k_param,const unsigned long l_param,const unsigned long P,const mpres_t Delta,mpmod_t modulus_param,const mpzspm_t ntt_context)3702 pp1_sequence_h (listz_t h_x, listz_t h_y, mpzspv_t h_x_ntt, mpzspv_t h_y_ntt,
3703 		const listz_t f, const mpres_t b1_x, const mpres_t b1_y,
3704 		const long k_param, const unsigned long l_param,
3705 		const unsigned long P, const mpres_t Delta,
3706 		mpmod_t modulus_param, const mpzspm_t ntt_context)
3707 {
3708   unsigned long i;
3709   long timestart, realstart;
3710 
3711   if (l_param == 0UL)
3712     return;
3713 
3714   ASSERT (f != h_x);
3715   ASSERT (f != h_y);
3716 
3717   outputf (OUTPUT_VERBOSE, "Computing h_x and h_y");
3718   timestart = cputime ();
3719   realstart = realtime ();
3720 
3721   if (test_verbose (OUTPUT_TRACE))
3722     {
3723       mpz_t t;
3724       mpz_init (t);
3725       mpres_get_z (t, Delta, modulus_param);
3726       outputf (OUTPUT_TRACE, "\n/* pp1_sequence_h */ N = %Zd; "
3727 	       "Delta = %Zd; w = quadgen (4*Delta); k = %ld; P = %lu; "
3728 	       "/* PARI */\n", modulus_param->orig_modulus, t, k_param, P);
3729       outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ b_1 = ");
3730       gfp_ext_print (b1_x, b1_y, modulus_param, OUTPUT_TRACE);
3731       outputf (OUTPUT_TRACE, "; r = b_1^P; rn = b_1^(-P); /* PARI */\n");
3732       for (i = 0; i < l_param; i++)
3733 	outputf (OUTPUT_TRACE,
3734 		 "/* pp1_sequence_h */ f_%lu = %Zd; /* PARI */\n", i, f[i]);
3735       mpz_clear (t);
3736     }
3737 
3738 #ifdef _OPENMP
3739 #pragma omp parallel if (l_param > 100) private(i)
3740 #endif
3741   {
3742     const size_t tmplen = 2;
3743     mpres_t s_x[3], s_y[3], s2_x[2], s2_y[2], v[2], V2, rn_x, rn_y,
3744       tmp[2];
3745     mpmod_t modulus; /* Thread-local copy of modulus_param */
3746     mpz_t mt;
3747     unsigned long l = l_param, offset = 0;
3748     long k = k_param;
3749 
3750 #ifdef _OPENMP
3751     /* When multi-threading, we adjust the parameters for each thread */
3752 
3753     const int nr_chunks = omp_get_num_threads();
3754     const int thread_nr = omp_get_thread_num();
3755 
3756     l = (l_param - 1) / nr_chunks + 1;
3757     offset = thread_nr * l;
3758     if (offset <= l_param)
3759       l = MIN(l, l_param - offset);
3760     else
3761       l = 0;
3762 
3763     if (thread_nr == 0)
3764       outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
3765     outputf (OUTPUT_TRACE, "\n");
3766 #endif
3767 
3768     /* Each thread computes r[i + offset] = b1^(-P*(k+i+offset)^2) * f_i
3769        for i = 0, 1, ..., l-1, where l is the adjusted length of each thread */
3770 
3771     /* Test that k+offset does not overflow */
3772     ASSERT_ALWAYS (offset <= (unsigned long) LONG_MAX &&
3773 		   k <= LONG_MAX - (long) offset);
3774     k += (long) offset;
3775 
3776     mpz_init (mt);
3777     /* Make thread-local copy of modulus */
3778     mpmod_init_set (modulus, modulus_param);
3779 
3780     /* Init the local mpres_t variables */
3781     for (i = 0; i < 2; i++)
3782       {
3783 	mpres_init (s_x[i], modulus);
3784 	mpres_init (s_y[i], modulus);
3785 	mpres_init (s2_x[i], modulus);
3786 	mpres_init (s2_y[i], modulus);
3787 	mpres_init (v[i], modulus);
3788       }
3789     mpres_init (s_x[2], modulus);
3790     mpres_init (s_y[2], modulus);
3791     mpres_init (V2, modulus);
3792     mpres_init (rn_x, modulus);
3793     mpres_init (rn_y, modulus);
3794     for (i = 0; i < (unsigned long) tmplen; i++)
3795       mpres_init (tmp[i], modulus);
3796 
3797     /* Compute rn = b_1^{-P}. It has the same value for all threads,
3798        but we make thread local copies anyway. */
3799     gfp_ext_pow_norm1_sl (rn_x, rn_y, b1_x, b1_y, P, Delta, modulus, tmplen,
3800 			  tmp);
3801     mpres_neg (rn_y, rn_y, modulus);
3802 
3803     /* Compute s[0] = rn^(k^2) = r^(-k^2). We do it by two
3804        exponentiations by k and use v[0] and v[1] as temp storage */
3805     gfp_ext_pow_norm1_sl (v[0], v[1], rn_x, rn_y, k, Delta, modulus,
3806 			  tmplen, tmp);
3807     gfp_ext_pow_norm1_sl (s_x[0], s_y[0], v[0], v[1], k, Delta, modulus,
3808 			  tmplen, tmp);
3809     if (test_verbose (OUTPUT_TRACE))
3810       {
3811 #ifdef _OPENMP
3812 #pragma omp critical
3813 #endif
3814 	{
3815 	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ", k);
3816 	  gfp_ext_print (s_x[0], s_y[0], modulus, OUTPUT_TRACE);
3817 	  outputf (OUTPUT_TRACE, " /* PARI C */\n");
3818 	}
3819       }
3820 
3821     /* Compute s[1] = r^(-(k+1)^2) = r^(-(k^2 + 2k + 1))*/
3822     if (l > 1)
3823       {
3824 	/* v[0] + v[1]*sqrt(Delta) still contains rn^k */
3825 	gfp_ext_sqr_norm1 (s_x[1], s_y[1], v[0], v[1], modulus);
3826 	/* Now s[1] = r^(-2k) */
3827 	gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], s_x[0], s_y[0], Delta,
3828 		     modulus, tmplen, tmp);
3829 	/* Now s[1] = r^(-(k^2 + 2k)) */
3830 	gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], rn_x, rn_y, Delta,
3831 		     modulus, tmplen, tmp);
3832 	/* Now s[1] = r^(-(k^2 + 2k + 1)) = r^(-(k+1)^2) */
3833 	if (test_verbose (OUTPUT_TRACE))
3834 	  {
3835 #ifdef _OPENMP
3836 #pragma omp critical
3837 #endif
3838 	    {
3839 	      outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ",
3840 		       k + 1);
3841 	      gfp_ext_print (s_x[1], s_y[1], modulus, OUTPUT_TRACE);
3842 	      outputf (OUTPUT_TRACE, " /* PARI C */\n");
3843 	    }
3844 	  }
3845       }
3846 
3847     /* Compute s2[0] = r^(k^2+2) = r^(k^2) * r^2 */
3848     gfp_ext_sqr_norm1 (v[0], v[1], rn_x, rn_y, modulus);
3849     gfp_ext_mul (s2_x[0], s2_y[0], s_x[0], s_y[0], v[0], v[1], Delta, modulus,
3850 		 tmplen, tmp);
3851     if (test_verbose (OUTPUT_TRACE))
3852       {
3853 #ifdef _OPENMP
3854 #pragma omp critical
3855 #endif
3856 	{
3857 	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ", k);
3858 	  gfp_ext_print (s2_x[0], s2_y[0], modulus, OUTPUT_TRACE);
3859 	  outputf (OUTPUT_TRACE, " /* PARI C */\n");
3860 	}
3861       }
3862 
3863     /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
3864     gfp_ext_mul (s2_x[1], s2_y[1], s_x[1], s_y[1], v[0], v[1], Delta, modulus,
3865 		 tmplen, tmp);
3866     if (test_verbose (OUTPUT_TRACE))
3867       {
3868 #ifdef _OPENMP
3869 #pragma omp critical
3870 #endif
3871 	{
3872 	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ",
3873 		   k + 1);
3874 	  gfp_ext_print (s2_x[1], s2_y[1], modulus, OUTPUT_TRACE);
3875 	  outputf (OUTPUT_TRACE, " /* PARI C */\n");
3876 	}
3877       }
3878 
3879     /* Compute V_2(r + 1/r). Since 1/r = rn_x - rn_y, we have r+1/r = 2*rn_x.
3880        V_2(x) = x^2 - 2, so we want 4*rn_x^2 - 2. */
3881     mpres_add (V2, rn_x, rn_x, modulus); /* V2 = r + 1/r  = 2*rn_x */
3882     V (v[0], V2, 2 * k + 1, modulus);  /* v[0] = V_{2k+1} (r + 1/r) */
3883     V (v[1], V2, 2 * k + 3, modulus);  /* v[1] = V_{2k+3} (r + 1/r) */
3884     mpres_sqr (V2, V2, modulus);       /* V2 = 4*a_x^2 */
3885     mpres_sub_ui (V2, V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
3886     if (test_verbose (OUTPUT_TRACE))
3887       {
3888 #ifdef _OPENMP
3889 #pragma omp critical
3890 #endif
3891 	{
3892 	  mpres_get_z (mt, V2, modulus);
3893 	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^2 + 1/r^2 == %Zd "
3894 		   "/* PARI C */\n", mt);
3895 	  mpres_get_z (mt, v[0], modulus);
3896 	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+1) + "
3897 		   "1/r^(2*%ld+1) == %Zd /* PARI C */\n", k, k, mt);
3898 	  mpres_get_z (mt, v[1], modulus);
3899 	  outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+3) + "
3900 		   "1/r^(2*%ld+3) == %Zd /* PARI C */\n", k, k, mt);
3901 	}
3902       }
3903 
3904     for (i = 0; i < 2UL && i < l; i++)
3905       {
3906 	/* Multiply the 2nd coordinate by Delta, so that after the polynomial
3907 	   multipoint evaluation we get x1 + Delta*x2 */
3908 	mpres_mul (s_y[i], s_y[i], Delta, modulus);
3909 	mpres_mul (s2_y[i], s2_y[i], Delta, modulus);
3910 
3911 	if (h_x != NULL)
3912 	  mpres_mul_z_to_z (h_x[i + offset], s_x[i], f[i + offset], modulus);
3913 	if (h_y != NULL)
3914 	  mpres_mul_z_to_z (h_y[i + offset], s_y[i], f[i + offset], modulus);
3915 	if (h_x_ntt != NULL)
3916 	  {
3917 	    mpres_mul_z_to_z (mt, s_x[i], f[i + offset], modulus);
3918 	    mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
3919 	  }
3920 	if (h_y_ntt != NULL)
3921 	  {
3922 	    mpres_mul_z_to_z (mt, s_y[i], f[i + offset], modulus);
3923 	    mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
3924 	  }
3925       }
3926 
3927     /* Compute the remaining r^((k+i)^2) values according to Peter's
3928        recurrence */
3929 
3930     for (i = 2; i < l; i++)
3931       {
3932 	if (h_x != NULL || h_x_ntt != NULL)
3933 	  {
3934 	    /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
3935 	       taken modulo 2 */
3936 	    mpres_mul (s_x[i % 3], s2_x[1 - i % 2], v[i % 2], modulus);
3937 	    mpres_sub (s_x[i % 3], s_x[i % 3], s2_x[i % 2], modulus);
3938 
3939 	    /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
3940 	    mpres_mul (s2_x[i % 2], s2_x[1 - i % 2], v[1 - i % 2], modulus);
3941 	    mpres_sub (s2_x[i % 2], s2_x[i % 2], s_x[(i - 2) % 3], modulus);
3942 	    if (h_x != NULL)
3943 	      mpres_mul_z_to_z (h_x[i + offset], s_x[i % 3], f[i + offset],
3944 				modulus);
3945 	    if (h_x_ntt != NULL)
3946 	      {
3947 		mpres_mul_z_to_z (mt, s_x[i % 3], f[i + offset], modulus);
3948 		mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
3949 	      }
3950 	  }
3951 
3952 	if (h_y != NULL || h_y_ntt != NULL)
3953 	  {
3954 	    /* Same for y coordinate */
3955 	    mpres_mul (s_y[i % 3], s2_y[1 - i % 2], v[i % 2], modulus);
3956 	    mpres_sub (s_y[i % 3], s_y[i % 3], s2_y[i % 2], modulus);
3957 	    mpres_mul (s2_y[i % 2], s2_y[1 - i % 2], v[1 - i % 2], modulus);
3958 	    mpres_sub (s2_y[i % 2], s2_y[i % 2], s_y[(i - 2) % 3], modulus);
3959 	    if (h_y != NULL)
3960 	      mpres_mul_z_to_z (h_y[i + offset], s_y[i % 3], f[i + offset],
3961 				modulus);
3962 	    if (h_y_ntt != NULL)
3963 	      {
3964 		mpres_mul_z_to_z (mt, s_y[i % 3], f[i + offset], modulus);
3965 		mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
3966 	      }
3967 	  }
3968 
3969 	/* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
3970 	mpres_mul (tmp[0], v[1 - i % 2], V2, modulus);
3971 	mpres_sub (v[i % 2], tmp[0], v[i % 2], modulus);
3972       }
3973 
3974     /* Clear the local mpres_t variables */
3975     for (i = 0; i < 2; i++)
3976       {
3977 	mpres_clear (s_x[i], modulus);
3978 	mpres_clear (s_y[i], modulus);
3979 	mpres_clear (s2_x[i], modulus);
3980 	mpres_clear (s2_y[i], modulus);
3981 	mpres_clear (v[i], modulus);
3982       }
3983     mpres_clear (s_x[2], modulus);
3984     mpres_clear (s_y[2], modulus);
3985     mpres_clear (V2, modulus);
3986     mpres_clear (rn_x, modulus);
3987     mpres_clear (rn_y, modulus);
3988     for (i = 0; i < tmplen; i++)
3989       mpres_clear (tmp[i], modulus);
3990 
3991     /* Clear the thread-local copy of modulus */
3992     mpmod_clear (modulus);
3993 
3994     mpz_clear (mt);
3995   }
3996 
3997   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3998 
3999   if (h_x != NULL && h_y != NULL && test_verbose (OUTPUT_TRACE))
4000     {
4001       for (i = 0; i < l_param; i++)
4002 	gmp_printf ("/* pp1_sequence_h */ (rn^((k+%lu)^2) * f_%lu) == "
4003 		    "(%Zd + Mod(%Zd / Delta, N) * w) /* PARI C */\n",
4004 		    i, i, h_x[i], h_y[i]);
4005     }
4006 }
4007 
4008 
4009 int
pp1fs2(mpz_t f,const mpres_t X,mpmod_t modulus,const faststage2_param_t * params)4010 pp1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus,
4011 	const faststage2_param_t *params)
4012 {
4013   unsigned long nr;
4014   unsigned long i, l, lenF, lenH, lenG, lenR, tmplen;
4015   sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
4016                        progressions of prime length */
4017   set_long_t *S_2; /* This is stored as a regular set */
4018   listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
4019 		  degree s_1. It is symmetric, so has only s_1 / 2 + 1
4020 		  distinct coefficients. The sequence h_j will be stored in
4021 		  the same memory and won't be a monic polynomial, so the
4022 		  leading 1 monomial of F will be stored explicitly. Hence we
4023 		  need s_1 / 2 + 1 entries. */
4024 
4025   listz_t g_x, g_y, fh_x, fh_y, h_x, h_y, tmp, R_x, R_y;
4026   const unsigned long tmpreslen = 2UL;
4027   mpres_t b1_x, b1_y, Delta, tmpres[2];
4028   mpz_t mt;   /* All-purpose temp mpz_t */
4029   int youpi = ECM_NO_FACTOR_FOUND;
4030   long timetotalstart, realtotalstart, timestart;
4031 
4032   timetotalstart = cputime ();
4033   realtotalstart = realtime ();
4034 
4035   ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
4036   ASSERT_ALWAYS (params->s_1 < params->l);
4037   nr = params->l - params->s_1; /* Number of points we evaluate */
4038 
4039   if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
4040       return ECM_ERROR;
4041 
4042   /* Allocate all the memory we'll need */
4043   /* Allocate the correct amount of space for each mpz_t or the
4044      reallocations will up to double the time for stage 2! */
4045   mpz_init (mt);
4046   mpres_init (b1_x, modulus);
4047   mpres_init (b1_y, modulus);
4048   mpres_init (Delta, modulus);
4049   for (i = 0; i < tmpreslen; i++)
4050       mpres_init (tmpres[i], modulus);
4051   lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
4052 				     the leading 1 monomial for each factor */
4053   lenH = params->s_1 + 1;
4054   lenG = params->l;
4055   lenR = nr;
4056   F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
4057   fh_x = init_list2 (lenF, (unsigned int) abs (modulus->bits));
4058   fh_y = init_list2 (lenF, (unsigned int) abs (modulus->bits));
4059   h_x = malloc (lenH * sizeof (mpz_t));
4060   h_y = malloc (lenH * sizeof (mpz_t));
4061   if (h_x == NULL || h_y == NULL)
4062     {
4063       fprintf (stderr, "Cannot allocate memory in pp1fs2\n");
4064       exit (1);
4065     }
4066   g_x = init_list2 (lenG, (unsigned int) abs (modulus->bits));
4067   g_y = init_list2 (lenG, (unsigned int) abs (modulus->bits));
4068   R_x = init_list2 (lenR, (unsigned int) abs (modulus->bits));
4069   R_y = init_list2 (lenR, (unsigned int) abs (modulus->bits));
4070   tmplen = 3UL * params->l + list_mul_mem (params->l / 2) + 20;
4071   outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
4072   if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
4073     {
4074       tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
4075       /* FIXME: It appears TMulGen_space() returns a too small value! */
4076       outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n",
4077 	       tmplen);
4078     }
4079 
4080   tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
4081 
4082   if (test_verbose (OUTPUT_TRACE))
4083     {
4084       mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
4085       outputf (OUTPUT_TRACE,
4086 	       "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
4087 	       modulus->orig_modulus, mt);
4088     }
4089 
4090   /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2 k_1}) */
4091   outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
4092 
4093   timestart = cputime ();
4094   i = poly_from_sets_V (F, X, S_1, tmp, tmplen, modulus, NULL, NULL);
4095   ASSERT_ALWAYS(2 * i == params->s_1);
4096   ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
4097   free (S_1);
4098   S_1 = NULL;
4099 
4100   outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
4101   if (test_verbose (OUTPUT_TRACE))
4102     {
4103       for (i = 0; i < params->s_1 / 2 + 1; i++)
4104 	outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
4105       outputf (OUTPUT_TRACE, "f(x) = f_0");
4106       for (i = 1; i < params->s_1 / 2 + 1; i++)
4107 	outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
4108       outputf (OUTPUT_TRACE, "/* PARI */ \n");
4109     }
4110 
4111   /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
4112   mpres_sqr (Delta, X, modulus);
4113   mpres_sub_ui (Delta, Delta, 4UL, modulus);
4114   mpres_div_2exp (b1_x, X, 1, modulus);
4115   mpres_set_ui (b1_y, 1UL, modulus);
4116   mpres_div_2exp (b1_y, b1_y, 1, modulus);
4117   if (test_verbose (OUTPUT_TRACE))
4118     {
4119       mpres_get_z (mt, Delta, modulus);
4120       outputf (OUTPUT_TRACE,
4121 	       "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
4122       gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
4123       outputf (OUTPUT_TRACE, "; /* PARI */\n");
4124       outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
4125     }
4126 
4127   /* Compute the h sequence h_j = b1^(P*-j^2) * f_j for 0 <= j <= s_1 */
4128   pp1_sequence_h (fh_x, fh_y, NULL, NULL, F, b1_x, b1_y, 0L,
4129 		  params->s_1 / 2 + 1, params->P, Delta, modulus, NULL);
4130   /* We don't need F(x) any more */
4131   clear_list (F, lenF);
4132 
4133   /* Make a symmetric copy of fh in h. */
4134   for (i = 0; i < params->s_1 / 2 + 1; i++)
4135     {
4136       *(h_x[i]) = *(fh_x[params->s_1 / 2 - i]); /* Clone the mpz_t */
4137       *(h_y[i]) = *(fh_y[params->s_1 / 2 - i]);
4138     }
4139   for (i = 0; i < params->s_1 / 2; i++)
4140     {
4141       *(h_x[i + params->s_1 / 2 + 1]) = *(fh_x[i + 1]);
4142       *(h_y[i + params->s_1 / 2 + 1]) = *(fh_y[i + 1]);
4143     }
4144   if (test_verbose (OUTPUT_TRACE))
4145     {
4146       for (i = 0; i < params->s_1 + 1; i++)
4147 	outputf (OUTPUT_VERBOSE, "h_%lu = %Zd + %Zd * w; /* PARI */\n",
4148 		 i, h_x[i], h_y[i]);
4149     }
4150 
4151   for (l = 0; l < params->s_2; l++)
4152     {
4153       const long M = params->l - 1 - params->s_1 / 2;
4154       outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
4155                l + 1, params->s_2);
4156       pp1_sequence_g (g_x, g_y, NULL, NULL, b1_x, b1_y, params->P,
4157 		      Delta, M, params->l, params->m_1, S_2->elem[l],
4158 		      modulus, NULL);
4159 
4160       /* Do the two convolution products */
4161       outputf (OUTPUT_VERBOSE, "TMulGen of g_x and h_x");
4162       timestart = cputime ();
4163       if (TMulGen (R_x, nr - 1, h_x, params->s_1, g_x, params->l - 1, tmp,
4164 		   modulus->orig_modulus) < 0)
4165 	{
4166 	  outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
4167 		   "of memory)\n");
4168 	  youpi = ECM_ERROR;
4169 	  break;
4170 	}
4171       outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
4172       outputf (OUTPUT_VERBOSE, "TMulGen of g_y and h_y");
4173       timestart = cputime ();
4174       if (TMulGen (R_y, nr - 1, h_y, params->s_1, g_y, params->l - 1, tmp,
4175 		   modulus->orig_modulus) < 0)
4176 	{
4177 	  outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
4178 		   "of memory)\n");
4179 	  youpi = ECM_ERROR;
4180 	  break;
4181 	}
4182       outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
4183       for (i = 0; i < nr; i++)
4184 	  mpz_add (R_x[i], R_x[i], R_y[i]);
4185 
4186       timestart = cputime ();
4187       mpres_set_ui (tmpres[1], 1UL, modulus); /* Accumulate product in
4188 						 tmpres[1] */
4189       for (i = 0; i < nr; i++)
4190       {
4191 	  mpres_set_z_for_gcd (tmpres[0], R_x[i], modulus);
4192 #define TEST_ZERO_RESULT
4193 #ifdef TEST_ZERO_RESULT
4194 	  if (mpres_is_zero (tmpres[0], modulus))
4195 	      outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
4196 #endif
4197 	  mpres_mul (tmpres[1], tmpres[1], tmpres[0], modulus);
4198       }
4199       outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)^(1) took %lums\n",
4200 	       cputime () - timestart);
4201       if (test_verbose(OUTPUT_RESVERBOSE))
4202       {
4203 	  mpres_get_z (mt, tmpres[1], modulus);
4204 	  outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", mt);
4205       }
4206 
4207       mpres_gcd (mt, tmpres[1], modulus);
4208       if (mpz_cmp_ui (mt, 1UL) > 0)
4209 	{
4210 	  mpz_set (f, mt);
4211 	  youpi = ECM_FACTOR_FOUND_STEP2;
4212 	  break;
4213 	}
4214     }
4215 
4216   mpz_clear (mt);
4217   mpres_clear (b1_x, modulus);
4218   mpres_clear (b1_y, modulus);
4219   mpres_clear (Delta, modulus);
4220   for (i = 0; i < tmpreslen; i++)
4221       mpres_clear (tmpres[i], modulus);
4222   clear_list (fh_x, lenF);
4223   clear_list (fh_y, lenF);
4224   free (h_x);
4225   free (h_y);
4226   clear_list (g_x, lenG);
4227   clear_list (g_y, lenG);
4228   clear_list (R_x, lenR);
4229   clear_list (R_y, lenR);
4230   clear_list (tmp, tmplen);
4231   free (S_2);
4232 
4233   outputf (OUTPUT_NORMAL, "Step 2");
4234   /* In normal output mode, print only cpu time as we always have.
4235      In verbose mode, print real time as well if we used multi-threading */
4236   if (test_verbose (OUTPUT_VERBOSE))
4237     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
4238   else
4239     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
4240 
4241   return youpi;
4242 }
4243 
4244 
4245 int
pp1fs2_ntt(mpz_t f,const mpres_t X,mpmod_t modulus,const faststage2_param_t * params,const int twopass)4246 pp1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus,
4247 	    const faststage2_param_t *params, const int twopass)
4248 {
4249   unsigned long nr;
4250   unsigned long l, lenF;
4251   sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
4252                        progressions of prime length */
4253   set_long_t *S_2; /* This is stored as a regular set */
4254   listz_t F;   /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
4255 		  degree s_1. It is symmetric, so has only s_1 / 2 + 1
4256 		  distinct coefficients. The sequence h_j will be stored in
4257 		  the same memory and won't be a monic polynomial, so the
4258 		  leading 1 monomial of F will be stored explicitly. Hence we
4259 		  need s_1 / 2 + 1 entries. */
4260   listz_t R = NULL;  /* Is used only for two-pass convolution, has nr
4261 			entries. R is only ever referenced if twopass == 1,
4262 			but gcc does not realize that and complains about
4263 			uninitialized value, so we set it to NULL. */
4264   mpzspm_t ntt_context;
4265   mpzspv_t g_x_ntt, g_y_ntt, h_x_ntt, h_y_ntt;
4266   mpres_t b1_x, b1_y, Delta;
4267   mpz_t mt;   /* All-purpose temp mpz_t */
4268   mpz_t product;
4269   mpz_t *product_ptr = NULL;
4270   int youpi = ECM_NO_FACTOR_FOUND;
4271   long timetotalstart, realtotalstart, timestart, realstart;
4272 
4273   timetotalstart = cputime ();
4274   realtotalstart = realtime ();
4275 
4276   ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
4277   ASSERT_ALWAYS (params->s_1 < params->l);
4278   nr = params->l - params->s_1; /* Number of points we evaluate */
4279 
4280   if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
4281       return ECM_ERROR;
4282 
4283   mpz_init (mt);
4284 
4285   /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution
4286      with g. We need NTT of transform length l here. If we want to add
4287      transformed vectors, we need to double the modulus. */
4288 
4289   if (twopass)
4290     mpz_set (mt, modulus->orig_modulus);
4291   else
4292     mpz_mul_2exp (mt, modulus->orig_modulus, 1UL);
4293 
4294   ntt_context = mpzspm_init (params->l, mt);
4295 
4296   if (ntt_context == NULL)
4297     {
4298       outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
4299                "presumably out of memory\n");
4300       mpz_clear (mt);
4301       free (S_1);
4302       S_1 = NULL;
4303       free (S_2);
4304       S_2 = NULL;
4305       return ECM_ERROR;
4306     }
4307 
4308   print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ",
4309 		    ntt_context);
4310 
4311   /* Allocate memory for F with correct amount of space for each mpz_t */
4312   lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
4313 				     the leading 1 monomial for each factor */
4314   {
4315     /* in some cases the above value of lenF is not enough, for example with
4316        s_1 = 10, which gives lenF = 7, but 9 entries are needed */
4317     unsigned long mem = mem_poly_from_sets_V (S_1);
4318     if (mem > lenF)
4319       lenF = mem;
4320   }
4321 
4322   F = init_list2 (lenF, (unsigned int) abs (modulus->bits) + GMP_NUMB_BITS);
4323 
4324   /* Build F */
4325   if (build_F_ntt (F, X, S_1, params, modulus) == ECM_ERROR)
4326     {
4327       free (S_1);
4328       free (S_2);
4329       mpz_clear (mt);
4330       mpzspm_clear (ntt_context);
4331       clear_list (F, lenF);
4332       return ECM_ERROR;
4333     }
4334 
4335   free (S_1);
4336   S_1 = NULL;
4337 
4338   mpres_init (b1_x, modulus);
4339   mpres_init (b1_y, modulus);
4340   mpres_init (Delta, modulus);
4341 
4342   /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
4343   mpres_sqr (Delta, X, modulus);
4344   mpres_sub_ui (Delta, Delta, 4UL, modulus);
4345   mpres_div_2exp (b1_x, X, 1, modulus);
4346   mpres_set_ui (b1_y, 1UL, modulus);
4347   mpres_div_2exp (b1_y, b1_y, 1, modulus);
4348   if (test_verbose (OUTPUT_TRACE))
4349     {
4350       mpres_get_z (mt, Delta, modulus);
4351       outputf (OUTPUT_TRACE,
4352 	       "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
4353       gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
4354       outputf (OUTPUT_TRACE, "; /* PARI */\n");
4355       outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
4356     }
4357 
4358   /* Allocate remaining memory for h_ntt */
4359   h_x_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
4360   h_y_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
4361   /* Compute the h_j sequence */
4362   pp1_sequence_h (NULL, NULL, h_x_ntt, h_y_ntt, F, b1_x, b1_y, 0L,
4363 		  params->s_1 / 2 + 1, params->P, Delta, modulus,
4364 		  ntt_context);
4365   /* We don't need F(x) any more */
4366   clear_list (F, lenF);
4367 
4368   /* compute the forward transform of h and store the distinct coefficients
4369      in h_ntt */
4370   g_x_ntt = mpzspv_init (params->l, ntt_context);
4371   if (twopass)
4372     {
4373       g_y_ntt = g_x_ntt;
4374       R = init_list2 (nr, (mpz_size (modulus->orig_modulus) + 2) *
4375                           GMP_NUMB_BITS);
4376     }
4377   else
4378     g_y_ntt = mpzspv_init (params->l, ntt_context);
4379 
4380   /* Compute DCT-I of h_x and h_y */
4381   outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_x");
4382 #ifdef _OPENMP
4383   outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4384 #endif
4385   timestart = cputime ();
4386   realstart = realtime ();
4387   mpzspv_to_dct1 (h_x_ntt, h_x_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
4388 		  g_x_ntt, ntt_context);
4389   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4390 
4391   outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_y");
4392 #ifdef _OPENMP
4393   outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4394 #endif
4395   timestart = cputime ();
4396   realstart = realtime ();
4397   mpzspv_to_dct1 (h_y_ntt, h_y_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
4398 		  g_x_ntt, ntt_context);
4399   print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4400 
4401   if (test_verbose (OUTPUT_RESVERBOSE))
4402     {
4403       mpz_init (product);
4404       product_ptr = &product;
4405     }
4406 
4407   for (l = 0; l < params->s_2; l++)
4408     {
4409       const long M = params->l - 1 - params->s_1 / 2;
4410 
4411       outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
4412                l + 1, params->s_2);
4413       if (twopass)
4414 	{
4415 	  /* Two-pass variant. Two separate convolutions,
4416 	     then addition in Z/NZ */
4417 	  pp1_sequence_g (NULL, NULL, g_x_ntt, NULL, b1_x, b1_y, params->P,
4418 			  Delta, M, params->l, params->m_1, S_2->elem[l],
4419 			  modulus, ntt_context);
4420 
4421 	  /* Do the convolution product of g_x * h_x */
4422 	  outputf (OUTPUT_VERBOSE, "Computing g_x*h_x");
4423 #ifdef _OPENMP
4424           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4425 #endif
4426 	  timestart = cputime ();
4427 	  realstart = realtime ();
4428 	  mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context,
4429 	    NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
4430 	  /* Store the product coefficients we want in R */
4431 	  mpzspv_to_mpzv (g_x_ntt, params->s_1 / 2, R, nr, ntt_context);
4432 	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4433 
4434 	  /* Compute g_y sequence */
4435 	  pp1_sequence_g (NULL, NULL, NULL, g_y_ntt, b1_x, b1_y, params->P,
4436 			  Delta, M, params->l, params->m_1, S_2->elem[l],
4437 			  modulus, ntt_context);
4438 
4439 	  /* Do the convolution product of g_y * (Delta * h_y) */
4440 	  outputf (OUTPUT_VERBOSE, "Computing g_y*h_y");
4441 #ifdef _OPENMP
4442           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4443 #endif
4444 	  timestart = cputime ();
4445 	  realstart = realtime ();
4446 	  mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context,
4447 	    NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
4448 	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4449 
4450 	  /* Compute product of sum of coefficients and gcd with N */
4451 	  ntt_gcd (mt, product_ptr, g_y_ntt, params->s_1 / 2, R, nr,
4452 		   ntt_context, modulus);
4453 	}
4454       else
4455 	{
4456 	  /* One-pass variant. Two forward transforms and point-wise products,
4457 	     then addition and single inverse transform */
4458 	  pp1_sequence_g (NULL, NULL, g_x_ntt, g_y_ntt, b1_x, b1_y, params->P,
4459 			  Delta, M, params->l, params->m_1, S_2->elem[l],
4460 			  modulus, ntt_context);
4461 
4462 	  outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_x");
4463 #ifdef _OPENMP
4464           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4465 #endif
4466 	  timestart = cputime ();
4467 	  realstart = realtime ();
4468 	  mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context,
4469 	    NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
4470 	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4471 
4472 	  outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_y");
4473 #ifdef _OPENMP
4474           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4475 #endif
4476 	  timestart = cputime ();
4477 	  realstart = realtime ();
4478 	  mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context,
4479 	    NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
4480 	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4481 
4482 	  outputf (OUTPUT_VERBOSE, "Adding and computing inverse NTT of sum");
4483 #ifdef _OPENMP
4484           outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4485 #endif
4486 	  timestart = cputime ();
4487 	  realstart = realtime ();
4488 	  mpzspv_add (g_x_ntt, (spv_size_t) 0, g_x_ntt, (spv_size_t) 0,
4489 	              g_y_ntt, (spv_size_t) 0, params->l, ntt_context);
4490 	  mpzspv_mul_by_dct (g_x_ntt, NULL, params->l, ntt_context,
4491 	    NTT_MUL_STEP_IFFT);
4492 	  print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4493 
4494 	  ntt_gcd (mt, product_ptr, g_x_ntt, params->s_1 / 2, NULL, nr,
4495 		   ntt_context, modulus);
4496 	}
4497 
4498       outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);
4499 
4500       if (mpz_cmp_ui (mt, 1UL) > 0)
4501 	{
4502 	  mpz_set (f, mt);
4503 	  youpi = ECM_FACTOR_FOUND_STEP2;
4504 	  break;
4505 	}
4506     }
4507 
4508   if (test_verbose (OUTPUT_RESVERBOSE))
4509     {
4510       product_ptr = NULL;
4511       mpz_clear (product);
4512     }
4513   mpzspv_clear (g_x_ntt, ntt_context);
4514   if (twopass)
4515     clear_list (R, nr);
4516   else
4517     mpzspv_clear (g_y_ntt, ntt_context);
4518   mpzspv_clear (h_x_ntt, ntt_context);
4519   mpzspv_clear (h_y_ntt, ntt_context);
4520   mpzspm_clear (ntt_context);
4521   mpz_clear (mt);
4522   mpres_clear (b1_x, modulus);
4523   mpres_clear (b1_y, modulus);
4524   mpres_clear (Delta, modulus);
4525   free (S_2);
4526 
4527   outputf (OUTPUT_NORMAL, "Step 2");
4528   /* In normal output mode, print only cpu time as we always have.
4529      In verbose mode, print real time as well if we used multi-threading */
4530   if (test_verbose (OUTPUT_VERBOSE))
4531     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
4532   else
4533     print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
4534 
4535   return youpi;
4536 }
4537