1 /* Implementation of fast stage 2 for P-1 and P+1 as described in
2 "Improved Stage 2 to $P\pm{}1$ Factoring Algorithms" by
3 Peter L. Montgomery and Alexander Kruppa, ANTS 2008 (8th Algorithmic
4 Number Theory Symposium).
5
6 Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2016
7 Alexander Kruppa, Paul Zimmermann, David Cleaver
8 NTT functions are based on code Copyright 2005 Dave Newman.
9
10 This file is part of the ECM Library.
11
12 The ECM Library is free software; you can redistribute it and/or modify
13 it under the terms of the GNU Lesser General Public License as published by
14 the Free Software Foundation; either version 3 of the License, or (at your
15 option) any later version.
16
17 The ECM Library is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
19 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
20 License for more details.
21
22 You should have received a copy of the GNU Lesser General Public License
23 along with the ECM Library; see the file COPYING.LIB. If not, see
24 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
25 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
26
27 #include "config.h"
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <limits.h>
31 #include "ecm-impl.h"
32 #include "sp.h"
33 #include <math.h>
34 #ifdef HAVE_ALLOCA_H
35 #include <alloca.h>
36 #endif
37 #ifdef HAVE_STRING_H
38 #include <string.h>
39 #endif
40 #ifdef _OPENMP
41 #include <omp.h>
42 #endif
43
44 /* TODO:
45 - move functions into their proper files (i.e. NTT functions etc.)
46 - later: allow storing NTT vectors on disk
47 */
48
49 /* Define TEST_ZERO_RESULT to test if any result of the multipoint
50 evaluation is equal to zero. If the modulus is composite, this
51 happening might indicate a problem in the evalutaion code */
52 #define TEST_ZERO_RESULT
53
54 const int pari = 0;
55
56 const unsigned long Pvalues[] = {
57 3UL, 5UL, 9UL, 15UL, 21UL, 17UL, 27UL, 33UL, 45UL, 51UL, 63UL, 75UL,
58 105UL, 99UL, 135UL, 165UL, 195UL, 189UL, 231UL, 255UL, 315UL, 345UL,
59 357UL, 375UL, 405UL, 435UL, 525UL, 585UL, 615UL, 735UL, 765UL, 825UL,
60 945UL, 1155UL, 1065UL, 1365UL, 1305UL, 1335UL, 1575UL, 1785UL, 1995UL,
61 2145UL, 2205UL, 2415UL, 2625UL, 2805UL, 3045UL, 3465UL, 3675UL, 4095UL,
62 4305UL, 4515UL, 4725UL, 4785UL, 5355UL, 5775UL, 5985UL, 5865UL, 6825UL,
63 7245UL, 8085UL, 8925UL, 9555UL, 10395UL, 10725UL, 11025UL, 12285UL,
64 12705UL, 15015UL, 14175UL, 15225UL, 16065UL, 17325UL, 19635UL, 21945UL,
65 23205UL, 24255UL, 25935UL, 26775UL, 28875UL, 31395UL, 33495UL, 35805UL,
66 36465UL, 38115UL, 39585UL, 40425UL, 45045UL, 45885UL, 49665UL, 51765UL,
67 58905UL, 65835UL, 69615UL, 75075UL, 77805UL, 82005UL, 84315UL, 86625UL,
68 88935UL, 94185UL, 98175UL, 105105UL, 109725UL, 116025UL, 118755UL,
69 121275UL, 135135UL, 137445UL, 137655UL, 144375UL, 153615UL, 165165UL,
70 167475UL, 176715UL, 179025UL, 185955UL, 197505UL, 208845UL, 215985UL,
71 225225UL, 255255UL, 250635UL, 285285UL, 277095UL, 294525UL, 315315UL,
72 345345UL, 373065UL, 368445UL, 405405UL, 435435UL, 451605UL, 465465UL,
73 454545UL, 504735UL, 525525UL, 555555UL, 569415UL, 596505UL, 645645UL,
74 647955UL, 672945UL, 687225UL, 765765UL, 770385UL, 805035UL, 855855UL,
75 858585UL, 915915UL, 945945UL, 962115UL, 1036035UL, 1066065UL, 1119195UL,
76 1156155UL, 1276275UL, 1306305UL, 1354815UL, 1426425UL, 1456455UL,
77 1514205UL, 1576575UL, 1666665UL, 1726725UL, 1786785UL, 1789515UL,
78 1865325UL, 1996995UL, 1983135UL, 2177175UL, 2297295UL, 2327325UL,
79 2417415UL, 2567565UL, 2611455UL, 2807805UL, 2847075UL, 2878785UL,
80 3048045UL, 3161235UL, 3258255UL, 3357585UL, 3401475UL, 3533145UL,
81 3828825UL, 3918915UL, 3985905UL, 4279275UL, 4849845UL, 4789785UL,
82 4967655UL, 5180175UL, 5360355UL, 5870865UL, 5990985UL, 6561555UL,
83 6531525UL, 6891885UL, 7402395UL, 7912905UL, 8273265UL, 8580495UL,
84 8843835UL, 9444435UL, 10015005UL, 10465455UL, 10705695UL, 10885875UL,
85 11696685UL, 12267255UL, 12507495UL, 12785955UL, 13498485UL, 14549535UL,
86 14849835UL, 15570555UL, 16111095UL, 16291275UL, 17612595UL, 18123105UL,
87 18633615UL, 19684665UL, 20255235UL, 20825805UL, 22207185UL, 22717695UL,
88 24249225UL, 24819795UL, 25741485UL, 26531505UL, 28333305UL, 29354325UL,
89 30045015UL, 31396365UL, 32807775UL, 33948915UL, 33528495UL, 34879845UL,
90 37011975UL, 37522485UL, 39564525UL, 41096055UL, 43648605UL, 44219175UL,
91 45930885UL, 47222175UL, 48333285UL, 50075025UL, 51816765UL, 52777725UL,
92 55390335UL, 55547415UL, 59053995UL, 60063465UL, 61906845UL, 64579515UL,
93 66621555UL, 67492425UL, 70105035UL, 73258185UL, 74939865UL, 77224455UL,
94 79594515UL, 81876795UL, 84999915UL, 88062975UL, 91005915UL, 94189095UL,
95 98423325UL, 101846745UL, 111546435UL, 111035925UL, 115120005UL,
96 121246125UL, 124098975UL, 130945815UL, 140645505UL, 150345195UL,
97 150225075UL, 155450295UL, 158333175UL, 170255085UL, 179444265UL,
98 190285095UL, 198843645UL, 203408205UL, 206831625UL, 217222005UL,
99 229474245UL, 240705465UL, 252447195UL, 254999745UL, 269023755UL,
100 282146865UL, 287672385UL, 294076965UL, 306110805UL, 318302985UL,
101 334639305UL, 344338995UL, 354038685UL, 363738375UL, 373438065UL,
102 387221835UL, 400254855UL, 421936515UL, 431636205UL, 451035585UL,
103 453888435UL, 470434965UL, 480134655UL, 510765255UL, 522506985UL,
104 557732175UL, 570855285UL, 596530935UL, 610224615UL, 627912285UL,
105 654729075UL, 703227525UL, 722116395UL, 751725975UL, 780825045UL,
106 790524735UL, 821665845UL, 851275425UL, 863017155UL, 909984075UL,
107 936020085UL, 984518535UL, 1017041025UL, 1052416365UL
108 #if (ULONG_MAX > 4294967295)
109 ,1086110025UL, 1110614505UL, 1147371225UL, 1191785595UL, 1213887675UL,
110 1265809545UL, 1282356075UL, 1331995665UL, 1391905515UL, 1450103655UL,
111 1479202725UL, 1547100555UL, 1555088535UL, 1673196525UL, 1712565855UL,
112 1767130365UL, 1830673845UL, 1883166285UL, 1954487535UL, 2001964965UL,
113 2119382265UL, 2187280095UL, 2255177925UL, 2342475135UL, 2390973585UL,
114 2421213795UL, 2555868315UL, 2672264595UL, 2788660875UL, 2856558705UL,
115 2953555605UL, 3050552505UL, 3234846615UL, 3457939485UL, 3516137625UL,
116 3681032355UL, 3758629875UL, 3904125225UL, 4127218095UL, 4360010655UL,
117 4573403835UL, 4796496705UL, 4844995155UL, 5019589575UL, 5203883685UL,
118 5262081825UL, 5465775315UL, 5766465705UL, 5898837945UL, 6164152995UL,
119 6358146795UL, 6411780375UL, 6804332535UL, 6980458485UL, 7172920755UL,
120 7473611145UL, 7716103395UL, 7968295335UL, 8182259085UL, 8342499165UL,
121 8812168365UL, 9023519505UL, 9704539845UL, 9927632715UL, 10373818455UL,
122 10439434005UL, 10820004195UL, 11043097065UL, 11489282805UL,
123 11877270405UL, 12381654285UL, 12604747155UL, 13080031965UL,
124 13274025765UL, 13642613985UL, 14389490115UL, 14583483915UL,
125 15058768725UL, 15611651055UL, 16174233075UL, 16397325945UL,
126 17289697425UL, 17735883165UL, 18143270145UL, 18381678315UL,
127 19074440385UL, 19559424885UL, 20636090475UL, 20941375455UL,
128 21800053275UL, 22643926305UL, 23148310185UL, 24205576395UL,
129 24546777255UL, 25544133615UL, 26389538175UL, 26863291455UL,
130 27813861075UL, 29113619535UL, 29494189725UL, 30520074585UL,
131 30684969315UL, 31790733975UL, 33575476935UL, 34467848415UL,
132 35202742575UL, 36427185795UL, 38037334335UL, 39240095895UL,
133 40365259935UL, 42053005995UL, 43168470345UL, 44953213305UL,
134 45845584785UL, 48522699225UL, 50307442185UL, 51869092275UL,
135 53653835235UL, 54546206715UL, 56680138515UL, 58784971245UL,
136 59386352025UL, 61908271425UL, 63431122755UL, 65700850215UL,
137 67931778915UL, 70162707615UL, 72616729185UL, 74120181135UL,
138 75740029365UL, 78417143805UL, 80871165375UL, 82840202445UL,
139 86448487125UL, 88466022645UL, 91133437395UL, 92918180355UL,
140 100280245065UL, 100726430805UL, 102811864155UL, 106749938295UL,
141 109000266375UL, 113219631525UL, 119689324755UL, 121027881975UL,
142 127943760945UL, 132628711215UL, 134859639915UL, 141775518885UL,
143 148691397855UL, 150922326555UL, 155607276825UL, 161320394235UL,
144 164977177365UL, 171446870595UL, 177470378085UL, 183270792705UL
145 #endif
146 };
147
148 /* All the prime factors that can appear in eulerphi(P) */
149 const unsigned long phiPfactors[] = {2UL, 3UL, 5UL, 7UL, 11UL, 13UL,
150 17UL, 19UL};
151
152
153 /* Some useful PARI functions:
154 sumset(a,b) = {local(i, j, l); l = listcreate (length(a) * length(b)); for (i = 1, length(a), for (j = 1, length(b), listput(l, a[i] + b[j]))); listsort (l, 1); l}
155
156 V(i,X) = { if (i==0, return(2)); if (i==1, return(X)); if(i%2 == 0, return (V (i/2, X)^2-2)); return (V ((i+1)/2, X) * V ((i-1)/2, X) - X)}
157
158 U(i,X) = { if (i==0, return(0)); if (i==1, return(1)); if(i%2 == 0, return (U (i/2, X) * V(i/2,X))); return (V ((i+1)/2, X) *U( (i-1)/2, X) + 1)}
159 */
160
161 #ifndef _OPENMP
omp_get_num_threads()162 static int omp_get_num_threads () {return 1;}
omp_get_thread_num()163 static int omp_get_thread_num () {return 0;}
164 #endif
165
166 static void
167 ntt_sqr_reciprocal (mpzv_t, const mpzv_t, mpzspv_t, const spv_size_t,
168 const mpzspm_t);
169
170 static void
print_elapsed_time(int verbosity,long cpu_start,ATTRIBUTE_UNUSED long real_start)171 print_elapsed_time (int verbosity, long cpu_start,
172 ATTRIBUTE_UNUSED long real_start)
173 {
174 #ifdef _OPENMP
175 if (real_start != 0L)
176 {
177 outputf (verbosity, " took %lums (%lums real)\n",
178 elltime (cpu_start, cputime()),
179 elltime (real_start, realtime()));
180 return;
181 }
182 #endif
183 outputf (verbosity, " took %lums\n", elltime (cpu_start, cputime()));
184 }
185
186
187 static void
print_CRT_primes(const int verbosity,const char * prefix,const mpzspm_t ntt_context)188 print_CRT_primes (const int verbosity, const char *prefix,
189 const mpzspm_t ntt_context)
190 {
191 double modbits = 0.;
192 unsigned int i;
193
194 if (test_verbose (verbosity))
195 {
196 outputf (verbosity, "%s%lu", prefix, ntt_context->spm[0]->sp);
197 modbits += log ((double) ntt_context->spm[0]->sp);
198 for (i = 1; i < ntt_context->sp_num; i++)
199 {
200 outputf (verbosity, " * %lu", ntt_context->spm[i]->sp);
201 modbits += log ((double) ntt_context->spm[i]->sp);
202 }
203 outputf (verbosity, ", has %d primes, %f bits\n",
204 ntt_context->sp_num, modbits / log (2.));
205 }
206 }
207
208 /* Approximate amount of memory in bytes each coefficient in an NTT takes
209 so that NTT can do transforms up to length lmax with modulus, or
210 with 2*modulus if twice != 0 */
211 static size_t
ntt_coeff_mem(const unsigned long lmax,const mpz_t modulus,const int twice)212 ntt_coeff_mem (const unsigned long lmax, const mpz_t modulus, const int twice)
213 {
214 mpz_t t;
215 size_t n;
216
217 mpz_init (t);
218 mpz_mul (t, modulus, modulus);
219 mpz_mul_ui (t, t, lmax);
220 if (twice)
221 mpz_mul_2exp (t, t, 1UL);
222 /* +4: +1 for rounding up, +3 for extra words due to ECRT */
223 n = (mpz_sizeinbase (t, 2) - 1) / SP_NUMB_BITS + 4;
224 mpz_clear (t);
225 return n * sizeof (sp_t);
226 }
227
228 size_t
pm1fs2_memory_use(const unsigned long lmax,const mpz_t modulus,const int use_ntt)229 pm1fs2_memory_use (const unsigned long lmax, const mpz_t modulus,
230 const int use_ntt)
231 {
232 if (use_ntt)
233 {
234 /* We store lmax / 2 + 1 coefficients for the DCT-I of F and lmax
235 coefficients for G in NTT ready format. Each coefficient in
236 NTT-ready format occupies approx.
237 ceil(log(lmax*modulus^2)/log(bits per sp_t)) + 3 words. */
238
239 size_t n;
240
241 n = ntt_coeff_mem (lmax, modulus, 0) * (size_t) (3 * lmax / 2 + 1);
242 outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
243 "with lmax = %lu NTT is %lu bytes\n", lmax, n);
244 return n;
245 }
246 else
247 {
248 /* F stores s_1/2 residues,
249 h stores s_1 mpz_t structs (residues get cloned from F)
250 g stores lmax residues,
251 R stores lmax-s_1 residues,
252 and tmp stores 3*lmax+list_mul_mem (lmax / 2) residues.
253 Assume s_1 is close to lmax/2.
254 Then we have
255 lmax/4 + lmax/2 + lmax + lmax/2 + 3*lmax + list_mul_mem (lmax / 2)
256 = (5+1/4)*lmax + list_mul_mem (lmax / 2) residues, plus s_1 mpz_t.
257 */
258
259 size_t n;
260
261 n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
262 n *= 5 * lmax + lmax / 4 + list_mul_mem (lmax / 2);
263 n += lmax / 2 * sizeof (mpz_t);
264 /* Memory use due to temp space allocation in TMulKS appears to
265 approximately triple the estimated memory use. This is hard to
266 estimate precisely, so let's go with the fudge factor of 3 here */
267 n *= 3;
268 outputf (OUTPUT_DEVVERBOSE, "pm1fs2_memory_use: Estimated memory use "
269 "with lmax = %lu is %lu bytes\n", lmax, n);
270 return n;
271 }
272 }
273
274 /* return the possible lmax for given memory use and modulus */
275
276 unsigned long
pm1fs2_maxlen(const size_t memory,const mpz_t modulus,const int use_ntt)277 pm1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt)
278 {
279 if (use_ntt)
280 {
281 size_t n, lmax = 1;
282
283 n = ntt_coeff_mem (lmax, modulus, 0);
284 lmax = 1UL << ceil_log2 (memory / n / 3);
285 return lmax;
286 }
287 else
288 {
289 size_t lmax, n;
290
291 n = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
292
293 /* Guess an initial value of lmax for list_mul_mem (lmax / 2) */
294 /* memory = n * 25/4 * lmax + lmax / 2 * sizeof (mpz_t); */
295 /* Fudge factor of 3 for TMulKS as above */
296 lmax = memory / (3 * 25 * n / 4 + 3 * sizeof (mpz_t) / 2);
297 return lmax;
298 }
299 }
300
301 size_t
pp1fs2_memory_use(const unsigned long lmax,const mpz_t modulus,const int use_ntt,const int twopass)302 pp1fs2_memory_use (const unsigned long lmax, const mpz_t modulus,
303 const int use_ntt, const int twopass)
304 {
305 size_t n, m;
306
307 m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
308 if (use_ntt)
309 {
310 /* In one pass mode, we store h_x_ntt and h_y_ntt, each of length
311 lmax/2(+1), and g_x_ntt and g_y_ntt, each of length lmax, all in
312 NTT ready format. In two pass mode, we store h_x_ntt, h_y_ntt and
313 g_x_ntt as before, plus R which is lmax - s_1 mpz_t.
314 We assume s_1 ~= lmax/2.
315 */
316
317 n = ntt_coeff_mem (lmax, modulus, !twopass);
318 if (twopass)
319 return lmax * (2 * n + m / 2);
320 else
321 return lmax * 3 * n;
322 }
323 else
324 {
325 /* We allocate:
326 F: s_1/2 coefficients
327 fh_x, fh_y: s_1/2 coefficients
328 h_x, h_y: s_1 mpz_t's (cloned from fh_x and fh_y)
329 g_x, g_y: lmax coefficients
330 R_x, R_y: lmax - s_1 coefficients
331 tmp: 3UL * lmax + list_mul_mem (lmax / 2)
332 Assuming s_1 ~ lmax/2, that's
333 lmax/2 + 2*lmax/4 + 2*lmax + 2*lmax/2 * 3*lmax +
334 list_mul_mem (lmax / 2) =
335 7 + list_mul_mem (lmax / 2) coefficients and lmax mpz_t.
336 */
337
338 n = m * (7 * lmax + list_mul_mem (lmax / 2));
339 n += lmax * sizeof (mpz_t);
340 n = 5 * n / 2; /* A fudge factor again */
341 return n;
342 }
343 }
344
345 unsigned long
pp1fs2_maxlen(const size_t memory,const mpz_t modulus,const int use_ntt,const int twopass)346 pp1fs2_maxlen (const size_t memory, const mpz_t modulus, const int use_ntt,
347 const int twopass)
348 {
349 size_t n, m;
350
351 m = mpz_size (modulus) * sizeof (mp_limb_t) + sizeof (mpz_t);
352 if (use_ntt)
353 {
354 n = ntt_coeff_mem (1, modulus, !twopass);
355 if (twopass)
356 n = memory / (2 * n + m / 2);
357 else
358 n = memory / (3 * n);
359 return 1UL << (ceil_log2 (n / 2)); /* Rounded down to power of 2 */
360 }
361 else
362 {
363 return memory / 5 / (m * 8 + sizeof (mpz_t)) * 2;
364 }
365 }
366
367
368 /* Test if for given P, nr, B2min and B2 we can choose an m_1 so that the
369 stage 2 interval [B2min, B2] is covered. The effective B2min and B2
370 are stored in effB2min and effB2 */
371
372 static int
test_P(const mpz_t B2min,const mpz_t B2,mpz_t m_1,const unsigned long P,const unsigned long nr,mpz_t effB2min,mpz_t effB2)373 test_P (const mpz_t B2min, const mpz_t B2, mpz_t m_1, const unsigned long P,
374 const unsigned long nr, mpz_t effB2min, mpz_t effB2)
375 {
376 mpz_t m;
377 /* We need B2min >= 2 * max(S_1 + S_2) + (2*m_1 - 1)*P + 1, or
378 B2min - 2 * max(S_1 + S_2) - 1 >= (2*m_1)*P - P, or
379 (B2min - 2*max(S_1 + S_2) + P - 1)/(2P) >= m_1
380 Choose m_1 accordingly */
381
382 mpz_init (m);
383 sets_max (m, P);
384 mpz_mul_2exp (m, m, 1UL); /* m = 2*max(S_1 + S_2) */
385
386 mpz_sub (m_1, B2min, m);
387 mpz_sub_ui (m_1, m_1, 1UL); /* m_1 = B2min - 2*max(S_1 + S_2) - 1 */
388 mpz_add_ui (m_1, m_1, P);
389 mpz_fdiv_q_2exp (m_1, m_1, 1UL);
390 mpz_fdiv_q_ui (m_1, m_1, P); /* 2UL*P may overflow */
391
392 /* Compute effB2min = 2 * max(S_1 + S_2) + (2*(m_1 - 1) + 1)*P + 1 */
393
394 mpz_mul_2exp (effB2min, m_1, 1UL);
395 mpz_sub_ui (effB2min, effB2min, 1UL);
396 mpz_mul_ui (effB2min, effB2min, P);
397 mpz_add (effB2min, effB2min, m);
398 mpz_add_ui (effB2min, effB2min, 1UL);
399 ASSERT_ALWAYS (mpz_cmp (effB2min, B2min) <= 0);
400
401 /* Compute the smallest value coprime to P at the high end of the stage 2
402 interval that will not be covered:
403 2*(min(S_1 + S_2)) + (2*(m_1 + nr) + 1)*P.
404 We assume min(S_1 + S_2) = -max(S_1 + S_2) */
405 mpz_add_ui (effB2, m_1, nr);
406 mpz_mul_2exp (effB2, effB2, 1UL);
407 mpz_add_ui (effB2, effB2, 1UL);
408 mpz_mul_ui (effB2, effB2, P);
409 mpz_sub (effB2, effB2, m);
410
411 /* The effective B2 values is that value, minus 1 */
412 mpz_sub_ui (effB2, effB2, 1UL);
413
414 mpz_clear (m);
415 return (mpz_cmp (B2, effB2) <= 0);
416 }
417
418
419 static void
factor_phiP(int * exponents,const unsigned long phiP)420 factor_phiP (int *exponents, const unsigned long phiP)
421 {
422 const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
423 unsigned long cofactor = phiP;
424 int i;
425
426 ASSERT_ALWAYS (phiP > 0UL);
427
428 for (i = 0; i < nrprimes; i++)
429 for (exponents[i] = 0; cofactor % phiPfactors[i] == 0UL; exponents[i]++)
430 cofactor /= phiPfactors[i];
431
432 ASSERT_ALWAYS (cofactor == 1UL);
433 }
434
435
436 static unsigned long
pow_ul(const unsigned long b,const unsigned int e)437 pow_ul (const unsigned long b, const unsigned int e)
438 {
439 unsigned long r = 1UL;
440 unsigned int i;
441
442 for (i = 0; i < e; i++)
443 r *= b;
444
445 return r;
446 }
447
448 static unsigned long
absdiff_ul(unsigned long a,unsigned long b)449 absdiff_ul (unsigned long a, unsigned long b)
450 {
451 return (a > b) ? a - b : b - a;
452 }
453
454 /* Choose s_1 so that s_1 * s_2 = phiP, s_1 is positive and even,
455 s_2 >= min_s2 and s_2 is minimal and abs(s_1 - l) is minimal
456 under those conditions. If use_ntt == 1, we require s_1 < l.
457 Returns 0 if no such choice is possible */
458
459 static unsigned long
choose_s_1(const unsigned long phiP,const unsigned long min_s2,const unsigned long l,const int use_ntt)460 choose_s_1 (const unsigned long phiP, const unsigned long min_s2,
461 const unsigned long l, const int use_ntt)
462 {
463 const int nrprimes = sizeof (phiPfactors) / sizeof (unsigned long);
464 /* Using [nrprimes] here makes the compiler complain about variable-sized
465 arrays */
466 int phiPexponents[sizeof (phiPfactors) / sizeof (unsigned long)],
467 exponents[sizeof (phiPfactors) / sizeof (unsigned long)];
468 unsigned long s_1 = 0UL, s_2 = 0UL, trys_1;
469 int i;
470
471 ASSERT_ALWAYS (phiP > 0 && phiP % 2 == 0);
472
473 /* We want only even s_1. We divide one 2 out of phiP here... */
474 factor_phiP (phiPexponents, phiP / 2);
475 for (i = 0; i < nrprimes; i++)
476 exponents[i] = 0;
477
478 do {
479 trys_1 = 2; /* ... and add a 2 here */
480 for (i = 0; i < nrprimes; i++)
481 trys_1 *= pow_ul (phiPfactors[i], exponents[i]);
482 #if 0
483 printf ("choose_s_1: Trying trys_1 = %lu\n", trys_1);
484 #endif
485 /* See if it satisfies all the required conditions and is an
486 improvement over the previous choice */
487 if (phiP / trys_1 >= min_s2 &&
488 (s_2 == 0UL || phiP / trys_1 < s_2) &&
489 absdiff_ul (trys_1, l) < absdiff_ul (s_1, l) &&
490 (use_ntt == 0 || trys_1 < l))
491 {
492 #if 0
493 printf ("choose_s_1: New best s_1 for phiP = %lu, min_s2 = %lu, "
494 "l = %lu : %lu\n", phiP, min_s2, l, trys_1);
495 #endif
496 s_1 = trys_1;
497 }
498 for (i = 0; i < nrprimes; i++)
499 {
500 if (++(exponents[i]) <= phiPexponents[i])
501 break;
502 exponents[i] = 0;
503 }
504 } while (i < nrprimes);
505
506 return s_1;
507 }
508
509
510 /* Approximate cost of stage 2. Cost with and without ntt are not
511 comparable. We have l > s_1 and s_1 * s_2 = eulerphi(P), hence
512 s_2*l > eulerphi(P) and so cost (s_2, l) > eulerphi(P) for all P */
513 static unsigned long
est_cost(const unsigned long s_2,const unsigned long l,const int use_ntt,const int method)514 est_cost (const unsigned long s_2, const unsigned long l, const int use_ntt,
515 const int method)
516 {
517 if (method == ECM_PM1)
518 {
519 /* The time for building f, h and DCT-I of h seems to be about
520 7/6 of the time of computing g, h*g and gcd with NTT, and
521 3/2 of the time of computing g, h*g and gcd without NTT */
522
523 if (use_ntt)
524 return (7 * l) / 6 + s_2 * l;
525 else
526 return (3 * l) / 2 + s_2 * l;
527 }
528 else if (method == ECM_PP1)
529 {
530 /* Building f is the same, building h and its forward transform is
531 twice about as expensive as for P-1. Each multi-point evaluation
532 is twice as expensive as for P-1.
533 FIXME: The estimate for NTT assumes the "one-pass" variant, in
534 "two-pass" the multipoint evaluations are slower, so the optimum
535 shifts towards smaller s_2 some more */
536 if (use_ntt)
537 return (4 * l) / 5 + s_2 * l;
538 else
539 return (3 * l) / 4 + s_2 * l;
540 }
541 else
542 abort (); /* Invalid value for method */
543 }
544
545 /* Choose P so that a stage 2 range from B2min to B2 can be covered with
546 multipoint evaluations, each using a convolution of length at most lmax.
547 The parameters for stage 2 are stored in finalparams, the final effective
548 B2min and B2 values in final_B2min and final_B2, respecively. Each of these
549 may be NULL, in which case the value is not stored. It is permissible
550 to let B2min and final_B2min, or B2 and final_B2 point at the same mpz_t. */
551
552 long
choose_P(const mpz_t B2min,const mpz_t B2,const unsigned long lmax,const unsigned long min_s2,faststage2_param_t * finalparams,mpz_t final_B2min,mpz_t final_B2,const int use_ntt,const int method)553 choose_P (const mpz_t B2min, const mpz_t B2, const unsigned long lmax,
554 const unsigned long min_s2, faststage2_param_t *finalparams,
555 mpz_t final_B2min, mpz_t final_B2, const int use_ntt,
556 const int method)
557 {
558 /* Let S_1 + S_2 == (Z/PZ)* (mod P).
559
560 Let F(x) = \prod_{k_1 \in S_1} (x - b_1^{2 k_1}).
561
562 If we evaluate F(b_1^{2 k_2 + (2m + 1)P}) for all k_2 \in S_2 with
563 m_1 <= m < m_1+nr, we test all exponents 2 k_2 + (2m + 1)P - 2 k_1.
564 The largest value coprime to P at the low end of the stage 2 interval
565 *not* covered will be
566 2*max(S_2) + (2*(m_1-1) + 1)*P - 2*min(S_1).
567 The smallest value at the high end not covered will be
568 2*min(S_2) + (2*(m_1 + nr) + 1)*P - 2*max(S_1).
569 Assume S_1 and S_2 are symmetric around 0, so that max(S_1) = -min(S_1).
570 Then the largest ... is:
571 2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P
572 The smallest ... is:
573 -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P
574 The effective B2min = 2*(max(S_1) + max(S_2)) + (2*m_1 - 1)*P + 1
575 The effective B2max = -2*(max(S_1) + max(S_2)) + (2*m_1 + 2*nr + 1)*P - 1
576
577 Then the difference effB2max - effB2min =
578 -4*(max(S_1) + max(S_2)) + 2P*(nr + 1) - 2
579
580 We obviously require B2max - B2min <= 2*nr*P
581 Since nr < lmax, B2max - B2min <= 2*lmax*P or
582 P >= ceil((B2max - B2min)/(2*lmax))
583
584 Hence we are looking for an odd P with s_1 * s_2 = eulerphi(P) so that
585 s_1 ~= lmax / 2 and the whole stage 2 interval is covered. s_2 should
586 be small, as long as s_1 is small enough.
587 */
588
589 mpz_t B2l, m_1, effB2min, tryeffB2, effB2, lmin;
590 /* The best parameters found so far, P == 0 means that no suitable P
591 has been found yet: */
592 unsigned long P = 0, s_1 = 0, s_2 = 0, l = 0, cost = 0;
593 unsigned int i;
594 const unsigned int Pvalues_len = sizeof (Pvalues) / sizeof (unsigned long);
595 int r;
596
597 outputf (OUTPUT_TRACE,
598 "choose_P(B2min = %Zd, B2 = %Zd, lmax = %lu, min_s2 = %ld, "
599 "use_ntt = %d, method = %d\n",
600 B2min, B2, lmax, min_s2, use_ntt, method);
601
602 if (mpz_cmp (B2, B2min) < 0)
603 return 0L;
604
605 /* If we use the NTT, we allow only power-of-two transform lengths.
606 In that case, the code below assumes that lmax is a power of two.
607 If that is not the case, print error and return. */
608 if (use_ntt && (lmax & (lmax - 1UL)) != 0)
609 {
610 outputf (OUTPUT_ERROR,
611 "choose_P: Error, lmax = %lu is not a power of two\n", lmax);
612 return ECM_ERROR;
613 }
614
615 mpz_init (effB2);
616 mpz_init (tryeffB2);
617 mpz_init (effB2min);
618 mpz_init (B2l);
619 mpz_init (m_1);
620 mpz_init (lmin);
621
622 mpz_sub (B2l, B2, B2min);
623 mpz_add_ui (B2l, B2l, 1UL); /* +1 due to closed interval */
624
625 /* For each candidate P, check if [B2min, B2] can be covered at all,
626 and if so, what the best parameters (minimizing the cost, maximizing
627 effB2) are. If they are better than the best parameters for the best P
628 so far, remember them. */
629
630 for (i = 0 ; i < Pvalues_len; i++)
631 {
632 unsigned long tryP, tryphiP, trys_1, trys_2, tryl, trycost;
633
634 tryP = Pvalues[i];
635 tryphiP = eulerphi (tryP);
636
637 outputf (OUTPUT_TRACE,
638 "choose_P: trying P = %lu, eulerphi(P) = %lu\n", tryP, tryphiP);
639
640 /* If we have a good P already and this tryphiP >= cost, then
641 there's no hope for this tryP, since cost(s_2, l) > eulerphi(P) */
642 if (P != 0 && tryphiP >= cost)
643 {
644 outputf (OUTPUT_TRACE,
645 "choose_P: tryphiP > cost = %lu, this P is too large\n",
646 cost);
647 continue;
648 }
649
650 /* We have nr < l and effB2-effB2min <= 2*nr*P. Hence we need
651 l >= B2l/P/2 */
652 mpz_cdiv_q_ui (lmin, B2l, tryP);
653 mpz_cdiv_q_2exp (lmin, lmin, 1UL);
654 outputf (OUTPUT_TRACE, "choose_P: lmin = %Zd for P = %lu\n", lmin, tryP);
655 if (mpz_cmp_ui (lmin, lmax) > 0)
656 {
657 outputf (OUTPUT_TRACE,
658 "choose_P: lmin > lmax, this P is too small\n");
659 continue;
660 }
661
662 /* Try all possible transform lengths and store parameters in
663 P, s_1, s_2, l if they are better than the previously best ones */
664
665 /* Keep reducing tryl to find best parameters. For NTT, we only have
666 power of 2 lengths so far, so we can simply divide by 2.
667 For non-NTT, we have arbitrary transform lengths so we can decrease
668 in smaller steps... let's say by, umm, 25% each time? */
669 for (tryl = lmax; mpz_cmp_ui (lmin, tryl) <= 0;
670 tryl = (use_ntt) ? tryl / 2 : 3 * tryl / 4)
671 {
672 trys_1 = choose_s_1 (tryphiP, min_s2, tryl / 2, use_ntt);
673 if (trys_1 == 0)
674 {
675 outputf (OUTPUT_TRACE,
676 "choose_P: could not choose s_1 for P = %lu, l = %lu\n",
677 tryP, tryl);
678 continue;
679 }
680 ASSERT (tryphiP % trys_1 == 0UL);
681 trys_2 = tryphiP / trys_1;
682 outputf (OUTPUT_TRACE, "choose_P: chose s_1 = %lu, k = s_2 = %lu "
683 "for P = %lu, l = %lu\n", trys_1, trys_2, tryP, tryl);
684
685 if (test_P (B2min, B2, m_1, tryP, tryl - trys_1, effB2min, tryeffB2))
686 {
687 outputf (OUTPUT_TRACE,
688 "choose_P: P = %lu, l = %lu, s_1 = %lu, k = s_2 = %lu "
689 "works, m_1 = %Zd, effB2min = %Zd, effB2 = %zZd\n",
690 tryP, tryl, trys_1, trys_2, m_1, effB2min, tryeffB2);
691
692 /* do not allow B2min to be less than 0 */
693 if (mpz_cmp_ui(effB2min, 0) < 0)
694 continue;
695
696 /* We use these parameters if we
697 1. didn't have any suitable ones yet, or
698 2. these cover [B2min, B2] and are cheaper than the best
699 ones so far, or
700 3. they are as expensive but reach greater effB2. */
701 trycost = est_cost (trys_2, tryl, use_ntt, method);
702 ASSERT (tryphiP < trycost);
703 if (P == 0 || trycost < cost ||
704 (trycost == cost && mpz_cmp (tryeffB2, effB2) > 0))
705 {
706 outputf (OUTPUT_TRACE,
707 "choose_P: and is the new optimum (cost = %lu)\n",
708 trycost);
709 P = tryP;
710 s_1 = trys_1;
711 s_2 = trys_2;
712 l = tryl;
713 cost = trycost;
714 mpz_set (effB2, tryeffB2);
715 }
716 }
717 }
718 }
719
720 if (P != 0) /* If we found a suitable P */
721 {
722 /* Compute m_1, effB2min, effB2 again */
723 r = test_P (B2min, B2, m_1, P, l - s_1, effB2min, effB2);
724 ASSERT_ALWAYS(r != 0);
725 if (finalparams != NULL)
726 {
727 finalparams->P = P;
728 finalparams->s_1 = s_1;
729 finalparams->s_2 = s_2;
730 finalparams->l = l;
731 mpz_set (finalparams->m_1, m_1);
732 }
733 if (final_B2min != NULL)
734 mpz_set (final_B2min, effB2min);
735 if (final_B2 != NULL)
736 mpz_set (final_B2, effB2);
737 }
738
739 mpz_clear (effB2);
740 mpz_clear (tryeffB2);
741 mpz_clear (effB2min);
742 mpz_clear (B2l);
743 mpz_clear (m_1);
744 mpz_clear (lmin);
745
746 return (P != 0) ? (long) P : ECM_ERROR;
747 }
748
749
750
751 static void
list_output_poly(listz_t l,unsigned long len,int monic,int symmetric,char * prefix,char * suffix,int verbosity)752 list_output_poly (listz_t l, unsigned long len, int monic, int symmetric,
753 char *prefix, char *suffix, int verbosity)
754 {
755 unsigned long i;
756
757 if (prefix != NULL)
758 outputf (verbosity, prefix);
759
760 if (len == 0)
761 {
762 if (monic)
763 outputf (verbosity, "1\n", len, len);
764 else
765 outputf (verbosity, "0\n", len);
766 return;
767 }
768
769 if (monic)
770 {
771 if (symmetric)
772 outputf (verbosity, "(x^%lu + x^-%lu) + ", len, len);
773 else
774 outputf (verbosity, "x^%lu + ", len);
775 }
776 for (i = len - 1; i > 0; i--)
777 if (symmetric)
778 outputf (verbosity, "%Zd * (x^%lu + x^-%lu) + ", l[i], i, i);
779 else
780 outputf (verbosity, "%Zd * x^%lu + ", l[i], i);
781 outputf (verbosity, "%Zd", l[0]);
782 if (suffix != NULL)
783 outputf (verbosity, suffix);
784 }
785
786 #if 0
787 /* Multiply P[i] by r^{k(deg-i)}, for 0 <= i <= deg. Needs 3 entries in tmp. */
788 /* I.e., let P(x) = x^deg + \sum_{i=0}^{deg - 1} P[i] * x^i. The output is
789 R(x) = x^deg + \sum_{i=0}^{deg - 1} R[i] * x^i = r^(k deg) P(r^{-k} x). */
790 /* The input and output polynomials are monic and have the leading monomial
791 implicit, i.e. not actually stored in the array of coefficients. */
792 /* Returns 0 if a modular inversion failed (in which case R is left
793 unchanged), 1 otherwise */
794
795 static int
796 list_scale_rev (listz_t R, listz_t S, mpz_t r, long k, unsigned long deg,
797 mpz_t modulus, listz_t tmp,
798 ATTRIBUTE_UNUSED const unsigned long tmplen)
799 {
800 unsigned long i;
801
802 ASSERT (tmplen >= 3);
803 mpz_powm_ui (tmp[0], r, (unsigned long) labs (k), modulus);
804 if (k < 0)
805 {
806 if (!mpz_invert (tmp[0], tmp[0], modulus)) /* FIXME: get rid of this! */
807 return 0;
808 }
809 /* Here, tmp[0] = r^k */
810 mpz_set (tmp[1], tmp[0]);
811 /* mpz_set (R[deg], S[deg]); Leading monomial is not stored! */
812 for (i = 1; i + 1 <= deg; i++)
813 {
814 /* Here, tmp[1] = r^(ki) */
815 mpz_mul (tmp[2], S[deg-i], tmp[1]);
816 mpz_mod (R[deg-i], tmp[2], modulus);
817 mpz_mul (tmp[2], tmp[1], tmp[0]); /* FIXME, avoid unnecessary mul */
818 mpz_mod (tmp[1], tmp[2], modulus); /* at end of loop */
819 }
820 if (i <= deg)
821 {
822 mpz_mul (tmp[2], S[deg-i], tmp[1]);
823 mpz_mod (R[deg-i], tmp[2], modulus);
824 }
825
826 return 1;
827 }
828 #endif
829
830 /* Same, but does squaring which makes things easier */
831
832 static void
list_sqr_reciprocal(listz_t R,listz_t S,const unsigned long l,mpz_t modulus,listz_t tmp,ATTRIBUTE_UNUSED const unsigned long tmplen)833 list_sqr_reciprocal (listz_t R, listz_t S, const unsigned long l,
834 mpz_t modulus, listz_t tmp,
835 ATTRIBUTE_UNUSED const unsigned long tmplen)
836 {
837 unsigned long i;
838 listz_t Srev, r1 = tmp, r2 = tmp + 2 * l - 1, t = tmp + 4 * l - 2;
839
840 if (l == 0UL)
841 return;
842
843 /* FIXME: This modifies the input arguments. */
844 /* We have to divide S[0] by 2 */
845
846 ASSERT (tmplen >= 4 * l - 2 + list_mul_mem (l));
847
848 #if 0
849 gmp_printf ("/* list_sqr_reciprocal */ S(x) = %Zd", S[0]);
850 for (i = 1; i < l1; i++)
851 gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S[i], i, i);
852 gmp_printf ("\n");
853 #endif
854
855 if (mpz_odd_p (S[0]))
856 {
857 ASSERT_ALWAYS (mpz_odd_p (modulus));
858 mpz_add (S[0], S[0], modulus);
859 }
860 mpz_tdiv_q_2exp (S[0], S[0], 1UL);
861
862 list_mul (r1, S, l, S, l, 0, t);
863 /* r1 = f0*g0/4 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
864 #if 0
865 for (i = 0; i < 2UL * l - 1UL; i++)
866 gmp_printf ("list_sqr_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
867 #endif
868
869 Srev = (listz_t) malloc (l * sizeof (mpz_t));
870 ASSERT_ALWAYS (Srev != NULL);
871 for (i = 0UL; i < l; i++)
872 (*Srev)[i] = (*S)[l - 1UL - i];
873 list_mul (r2, S, l, Srev, l, 0, t);
874 /* r2 is symmetric, r2[i] = r2[2*l - 2 - i]. Check this */
875 #if 0
876 for (i = 0; 0 && i < 2UL * l - 1UL; i++)
877 gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
878 #endif
879 #ifdef WANT_ASSERT
880 for (i = 0UL; i < l; i++)
881 ASSERT (mpz_cmp (r2[i], r2[2UL * l - 2UL - i]) == 0);
882 #endif
883 free (Srev);
884 /* r2 = g1*f0/2 + (g0*f0/4 + g1*f1) * x + g0*f1/2 * x^2 */
885 #if 0
886 for (i = 0; i < 2UL * l - 1UL; i++)
887 gmp_printf ("list_sqr_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
888 #endif
889
890 mpz_mul_2exp (r1[0], r1[0], 1UL);
891 /* r1 = f0*g0/2 + (f0*g1 + f1*g0)/2 * x + f1*g1 * x^2 */
892 for (i = 0UL; i < l; i++)
893 {
894 mpz_mul_2exp (r2[l - i - 1UL], r2[l - i - 1UL], 1UL);
895 mpz_add (R[i], r1[i], r2[l - i - 1UL]);
896 }
897 /* r1 = 3/4*f0*g0 + g1*f1 + (f0*g1 + 2*f1*g0)/2 * x + f1*g1 * x^2 */
898 /* r1 = f0*g0 + 2*g1*f1 + (f0*g1 + f1*g0) * x + f1*g1 * x^2 */
899 for (i = l; i < 2UL * l - 1UL; i++)
900 mpz_set (R[i], r1[i]);
901
902 if (R != S)
903 mpz_mul_2exp (S[0], S[0], 1UL);
904
905 #if 0
906 for (i = 0; i < 2UL * l; i++)
907 gmp_printf ("list_sqr_reciprocal: R[%lu] = %Zd\n", i, R[i]);
908 #endif
909 }
910
911 #ifdef WANT_ASSERT
912 static void
list_recip_eval1(mpz_t R,const listz_t S,const unsigned long l)913 list_recip_eval1 (mpz_t R, const listz_t S, const unsigned long l)
914 {
915 unsigned long i;
916
917 mpz_set_ui (R, 0UL);
918 for (i = 1; i < l; i++)
919 mpz_add (R, R, S[i]);
920 mpz_mul_2exp (R, R, 1UL);
921 if (l > 0UL)
922 mpz_add (R, R, S[0]);
923 }
924 #endif
925
926 /* Multiply two reciprocal polynomials of degree 2*l1-2 and 2*l2-2, resp.,
927 with coefficients in standard basis
928
929 S_1(x) = S1[0] + sum_{1 \leq i \leq l1 - 1} S1[i] (x^i + x^{-i})
930 S_2(x) = S2[0] + sum_{1 \leq i \leq l2 - 1} S2[i] (x^i + x^{-i})
931
932 to the reciprocal polynomial of degree 2*(l1 + l2) - 4
933
934 R(x) = R[0] + sum_{1 \leq i \leq l1 + l2 - 2} R[i] (x^i + x^{-i})
935 = S_1(x) * S_2(x)
936
937 R == S1 == S2 is permissible, however if S1 == S2, l1 must be equal
938 to l2 (i.e. the multiplication must be a squaring)
939 */
940 /* FIXME: This modifies the input arguments. */
941 /* We have to divide S1[0] and S2[0] by 2 */
942
943 static void
list_mul_reciprocal(listz_t R,listz_t S1,unsigned long l1,listz_t S2,unsigned long l2,mpz_t modulus,listz_t tmp,ATTRIBUTE_UNUSED const unsigned long tmplen)944 list_mul_reciprocal (listz_t R, listz_t S1, unsigned long l1,
945 listz_t S2, unsigned long l2,
946 mpz_t modulus, listz_t tmp,
947 ATTRIBUTE_UNUSED const unsigned long tmplen)
948 {
949 unsigned long i;
950 const unsigned long lmax = MAX(l1, l2);
951 listz_t r1 = tmp, r2 = tmp + 2*lmax - 1, rev = tmp + 4*lmax - 2,
952 t = tmp + 6*lmax - 3;
953 #ifdef WANT_ASSERT
954 mpz_t sum1, sum2, prod;
955 #endif
956
957 ASSERT (S1 < tmp || S1 >= tmp + tmplen);
958 ASSERT (S2 < tmp || S2 >= tmp + tmplen);
959 ASSERT (R < tmp || R >= tmp + tmplen);
960
961 if (l1 == 0UL || l2 == 0UL)
962 return;
963
964 if (S1 == S2)
965 {
966 ASSERT_ALWAYS (l1 == l2);
967 list_sqr_reciprocal (R, S1, l1, modulus, tmp, tmplen);
968 return;
969 }
970
971 ASSERT (tmplen >= 6*lmax - 3 + list_mul_mem (lmax));
972 #ifdef WANT_ASSERT
973 mpz_init (sum1);
974 mpz_init (sum2);
975 mpz_init (prod);
976 list_recip_eval1 (sum1, S1, l1);
977 list_recip_eval1 (sum2, S2, l2);
978 mpz_mul (prod, sum1, sum2);
979 mpz_mod (prod, prod, modulus);
980 #endif
981
982
983 /* Make S1 the longer of the two, i.e. l1 >= l2 */
984 if (l2 > l1)
985 {
986 listz_t St = S1;
987 unsigned long lt = l1;
988 S1 = S2;
989 S2 = St;
990 l1 = l2;
991 l2 = lt;
992 }
993
994 #if 0
995 gmp_printf ("/* list_mul_reciprocal */ S1(x) = %Zd", S1[0]);
996 for (i = 1; i < l1; i++)
997 gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S1[i], i, i);
998 gmp_printf ("\n");
999 gmp_printf ("/* list_mul_reciprocal */ S2(x) = %Zd", S2[0]);
1000 for (i = 1; i < l1; i++)
1001 gmp_printf (" + %Zd * (x^%lu + 1/x^%lu)", S2[i], i, i);
1002 gmp_printf ("\n");
1003 #endif
1004
1005 /* Divide S1[0] and S2[0] by 2 */
1006 if (mpz_odd_p (S1[0]))
1007 {
1008 ASSERT_ALWAYS (mpz_odd_p (modulus));
1009 mpz_add (S1[0], S1[0], modulus);
1010 }
1011 mpz_tdiv_q_2exp (S1[0], S1[0], 1UL);
1012
1013 if (mpz_odd_p (S2[0]))
1014 {
1015 ASSERT_ALWAYS (mpz_odd_p (modulus));
1016 mpz_add (S2[0], S2[0], modulus);
1017 }
1018 mpz_tdiv_q_2exp (S2[0], S2[0], 1UL);
1019
1020 /* Pad rev with zeros */
1021 for (i = l2; i < lmax; i++)
1022 mpz_set_ui (rev[i], 0UL);
1023
1024 for (i = 0UL; i < l2; i++)
1025 mpz_set (rev[i], S2[l2 - 1UL - i]);
1026 list_mul (r1, S1, lmax, rev, lmax, 0, t);
1027 /* r1 = \tilde{f}(x) \rev(\tilde{g}(x)) and has degree l1 + l2 - 2,
1028 i.e. l1 + l2 - 1 entries. */
1029 #if 0
1030 for (i = 0; i < 2 * lmax - 1; i++)
1031 gmp_printf ("list_mul_reciprocal: r1[%lu] = %Zd\n", i, r1[i]);
1032 #endif
1033
1034 for (i = 0UL; i < l2; i++)
1035 mpz_set(rev[i], S2[i]);
1036 list_mul (r2, S1, lmax, rev, lmax, 0, t);
1037 /* \tilde{f}(x) \tilde{g}(x) */
1038
1039 #if 0
1040 for (i = 0; i < 2 * lmax - 1; i++)
1041 gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
1042 #endif
1043
1044 /* Add f_0*g_0 by doubling the f_0*g_0 term in r2 */
1045 mpz_mul_2exp (r2[0], r2[0], 1UL);
1046
1047 /* Add \flloor x^{-d_g} \tilde{f}(x) \rev(\tilde{g}(x)) \rfloor.
1048 d_g = l2 - 1. */
1049 for (i = 0; i < l1; i++)
1050 mpz_add (r2[i], r2[i], r1[i + l2 - 1]);
1051
1052 /* Add \floor x^{-d_f} rev(\tilde{f}(x) \rev(\tilde{g}(x))) \rfloor.
1053 d_f = l1 - 1. rev(r2)[i] = r2[l1 + l2 - 2 - i]. We want
1054 rev(r2)[l1 - 1 ... l1 + l2 - 2], hence
1055 r2[l2 - 1 ... 0] */
1056 for (i = 0; i < l2; i++)
1057 mpz_add (r2[i], r2[i], r1[l2 - 1 - i]);
1058
1059 #if 0
1060 for (i = 0; i < l1 + l2 - 1; i++)
1061 gmp_printf ("list_mul_reciprocal: r2[%lu] = %Zd\n", i, r2[i]);
1062 #endif
1063
1064 mpz_mul_2exp (S1[0], S1[0], 1UL);
1065 mpz_mul_2exp (S2[0], S2[0], 1UL);
1066
1067 for (i = 0; i < l1 + l2 - 1; i++)
1068 mpz_set (R[i], r2[i]);
1069
1070 #if 0
1071 for (i = 0; i < l1 + l2 - 1; i++)
1072 gmp_printf ("list_mul_reciprocal: R[%lu] = %Zd\n", i, R[i]);
1073 #endif
1074 #ifdef WANT_ASSERT
1075 list_recip_eval1 (sum1, R, l1 + l2 - 1);
1076 mpz_mod (sum1, sum1, modulus);
1077 ASSERT (mpz_cmp (prod, sum1) == 0);
1078 mpz_clear (sum1);
1079 mpz_clear (sum2);
1080 mpz_clear (prod);
1081 #endif
1082 }
1083
1084 /*
1085 Computes V_k(S), where the Chebyshev polynomial V_k(X) is defined by
1086 V_k(X + 1/X) = X^k + 1/X^k
1087 */
1088
1089 static void
V(mpres_t R,const mpres_t S,const long k,mpmod_t modulus)1090 V (mpres_t R, const mpres_t S, const long k, mpmod_t modulus)
1091 {
1092 mpres_t V0, Vi, Vi1;
1093 unsigned long j, uk;
1094 int po2;
1095
1096 if (k == 0L)
1097 {
1098 mpres_set_ui (R, 2UL, modulus);
1099 return;
1100 }
1101
1102 uk = labs (k);
1103
1104 if (uk == 1UL)
1105 {
1106 mpres_set (R, S, modulus);
1107 return;
1108 }
1109
1110 for (po2 = 0; uk % 2UL == 0UL; uk >>= 1, po2++);
1111
1112 mpres_init (V0, modulus);
1113 mpres_set_ui (V0, 2UL, modulus); /* V0 = V_0(S) = 2 */
1114
1115 if (uk == 1UL)
1116 {
1117 mpres_set (R, S, modulus);
1118 while (po2-- > 0)
1119 {
1120 mpres_sqr (R, R, modulus);
1121 mpres_sub (R, R, V0, modulus);
1122 }
1123 mpres_clear (V0, modulus);
1124 return;
1125 }
1126
1127 if (0)
1128 {
1129 mpz_t tz;
1130 mpz_init (tz);
1131 mpres_get_z (tz, S, modulus);
1132 gmp_printf ("Chebyshev_V(%ld, Mod(%Zd,N)) == ", k, tz);
1133 mpz_clear (tz);
1134 }
1135
1136 for (j = 1UL; j <= uk / 2UL; j <<= 1);
1137
1138 mpres_init (Vi, modulus);
1139 mpres_init (Vi1, modulus);
1140
1141 /* i = 1. Vi = V_i(S), Vi1 = V_{i+1}(S) */
1142 mpres_set (Vi, S, modulus);
1143 mpres_sqr (Vi1, S, modulus);
1144 mpres_sub (Vi1, Vi1, V0, modulus);
1145 j >>= 1;
1146
1147 while (j > 1)
1148 {
1149 if ((uk & j) != 0UL)
1150 {
1151 /* i' = 2i + 1.
1152 V_{i'} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1
1153 V_{i'+1} = V_{2i + 2} = {V_{i+1}}^2 - V_0. */
1154 mpres_mul (Vi, Vi, Vi1, modulus);
1155 mpres_sub (Vi, Vi, S, modulus);
1156 mpres_sqr (Vi1, Vi1, modulus);
1157 mpres_sub (Vi1, Vi1, V0, modulus);
1158 }
1159 else
1160 {
1161 /* i' = 2i.
1162 V_{i'} = V_{2i} = {V_i}^2 - V0.
1163 V_{i'+1} = V_{2i + 1} = V_{i+1 + i} = V_{i+1} * V_{i} - V_1 */
1164 mpres_mul (Vi1, Vi, Vi1, modulus);
1165 mpres_sub (Vi1, Vi1, S, modulus);
1166
1167 mpres_sqr (Vi, Vi, modulus);
1168 mpres_sub (Vi, Vi, V0, modulus);
1169 }
1170 j >>= 1;
1171 }
1172
1173 /* Least significant bit of uk is always 1 */
1174 mpres_mul (Vi, Vi, Vi1, modulus);
1175 mpres_sub (Vi, Vi, S, modulus);
1176
1177 while (po2-- > 0)
1178 {
1179 mpres_sqr (Vi, Vi, modulus);
1180 mpres_sub (Vi, Vi, V0, modulus);
1181 }
1182
1183 mpres_set (R, Vi, modulus);
1184
1185 mpres_clear (Vi, modulus);
1186 mpres_clear (Vi1, modulus);
1187 mpres_clear (V0, modulus);
1188
1189 if (0)
1190 {
1191 mpz_t tz;
1192 mpz_init (tz);
1193 mpres_get_z (tz, R, modulus);
1194 gmp_printf ("%Zd\n", tz);
1195 mpz_clear (tz);
1196 }
1197 }
1198
1199 /*
1200 Computes U_k(S), where the Chebyshev polynomial U_k(X) is defined by
1201 U_k(X + 1/X) = (X^k - 1/X^k) / (X - 1/X)
1202 If R1 != NULL, stores U_{k+1}(S) there
1203 */
1204
1205 static void
U(mpres_t R,mpres_t R1,const mpres_t S,const long k,mpmod_t modulus)1206 U (mpres_t R, mpres_t R1, const mpres_t S, const long k, mpmod_t modulus)
1207 {
1208 mpres_t V0, Vi, Vi1, Ui, Ui1, t;
1209 unsigned long j, uk;
1210
1211 if (k == 0L)
1212 {
1213 mpres_set_ui (R, 0UL, modulus); /* U_0 = 0 */
1214 if (R1 != NULL)
1215 mpres_set_ui (R1, 1UL, modulus); /* U_1 = 1 */
1216 return;
1217 }
1218
1219 uk = labs (k);
1220
1221 if (uk == 1UL)
1222 {
1223 mpres_set_ui (R, 1UL, modulus);
1224 if (k == -1)
1225 mpres_neg (R, R, modulus);
1226
1227 if (R1 != NULL)
1228 {
1229 if (k == -1)
1230 mpres_set_ui (R1, 0UL, modulus);
1231 else
1232 mpres_set (R1, S, modulus); /* U_2(S) = S */
1233 }
1234
1235 return;
1236 }
1237
1238 if (0)
1239 {
1240 mpz_t tz;
1241 mpz_init (tz);
1242 mpres_get_z (tz, S, modulus);
1243 gmp_printf ("Chebyshev_U(%ld, Mod(%Zd,N)) == ", k, tz);
1244 mpz_clear (tz);
1245 }
1246
1247 mpres_init (V0, modulus);
1248 mpres_init (Vi, modulus);
1249 mpres_init (Vi1, modulus);
1250 mpres_init (Ui, modulus);
1251 mpres_init (Ui1, modulus);
1252 mpres_init (t, modulus);
1253
1254 for (j = 1UL; j <= uk / 2UL; j <<= 1);
1255
1256 mpres_set_ui (Ui, 1UL, modulus); /* Ui = U_1(S) = 1 */
1257 mpres_set (Ui1, S, modulus); /* Ui1 = U_2(S) = S */
1258 mpres_add (V0, Ui, Ui, modulus); /* V0 = V_0(S) = 2 */
1259 mpres_set (Vi, S, modulus); /* Vi = V_1(S) = S */
1260 mpres_sqr (Vi1, Vi, modulus);
1261 mpres_sub (Vi1, Vi1, V0, modulus); /* Vi1 = V_2(S) = S^2 - 2 */
1262 j >>= 1; /* i = 1 */
1263
1264 while (j != 0)
1265 {
1266 if ((uk & j) == 0UL)
1267 {
1268 mpres_mul (Vi1, Vi1, Vi, modulus);
1269 mpres_sub (Vi1, Vi1, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
1270 /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
1271 mpres_sub (t, Ui1, Ui, modulus);
1272 mpres_add (Ui1, Ui1, Ui, modulus);
1273 mpres_mul (Ui1, Ui1, t, modulus);
1274 mpres_mul (Ui, Ui, Vi, modulus); /* U_{2n} = U_n V_n */
1275 mpres_sqr (Vi, Vi, modulus);
1276 mpres_sub (Vi, Vi, V0, modulus); /* V_{2n} = V_n^2 - 2 */
1277 }
1278 else
1279 {
1280 /* U_{2i+1} = (U_{i+1} + U_i) (U_{i+1} - U_i) */
1281 mpres_sub (t, Ui1, Ui, modulus);
1282 mpres_add (Ui, Ui, Ui1, modulus);
1283 mpres_mul (Ui, Ui, t, modulus);
1284 mpres_mul (Ui1, Ui1, Vi1, modulus); /* U_{2n+2} = U_{n+1} V_{n+1} */
1285 mpres_mul (Vi, Vi, Vi1, modulus);
1286 mpres_sub (Vi, Vi, S, modulus); /* V_{2i+1} = V_{i+1} V_i - V_1 */
1287 mpres_sqr (Vi1, Vi1, modulus);
1288 mpres_sub (Vi1, Vi1, V0, modulus); /* V_{2n+2} = V_{n+1}^2 - 2 */
1289 }
1290 j >>= 1;
1291 }
1292
1293 if (k > 0)
1294 mpres_set (R, Ui, modulus);
1295 else
1296 mpres_neg (R, Ui, modulus);
1297
1298 if (R1 != NULL)
1299 {
1300 /* Here k != -1,0,1, so k+1 is negative iff k is */
1301 if (k > 0)
1302 mpres_set (R1, Ui1, modulus);
1303 else
1304 mpres_neg (R1, Ui1, modulus);
1305 }
1306
1307 mpres_clear (V0, modulus);
1308 mpres_clear (Vi, modulus);
1309 mpres_clear (Vi1, modulus);
1310 mpres_clear (Ui, modulus);
1311 mpres_clear (Ui1, modulus);
1312 mpres_clear (t, modulus);
1313
1314 if (0)
1315 {
1316 mpz_t tz;
1317 mpz_init (tz);
1318 mpres_get_z (tz, R, modulus);
1319 gmp_printf ("%Zd\n", tz);
1320 mpz_clear (tz);
1321 }
1322 }
1323
1324
1325 /* Set R[i] = V_{i+k}(Q) * F[i] or U_{i+k}(Q) * F[i], for 0 <= i < len
1326 We compute V_{i+k+1}(Q) by V_{i+k}(Q)*V_1(Q) - V_{i+k-1}(Q).
1327 For U, we compute U_{i+k+1}(Q) by U_{i+k}(Q)*V_1(Q) - U_{i+k-1}(Q).
1328 The values of V_1(Q), V_{k-1}(Q) and V_k(Q) and V_k(Q) are in
1329 V1, Vk_1 and Vk, resp.
1330 The values of Vk_1 and Vk are clobbered. */
1331 static void
scale_by_chebyshev(listz_t R,const listz_t F,const unsigned long len,mpmod_t modulus,const mpres_t V1,mpres_t Vk_1,mpres_t Vk)1332 scale_by_chebyshev (listz_t R, const listz_t F, const unsigned long len,
1333 mpmod_t modulus, const mpres_t V1, mpres_t Vk_1,
1334 mpres_t Vk)
1335 {
1336 mpres_t Vt;
1337 unsigned long i;
1338
1339 mpres_init (Vt, modulus);
1340
1341 for (i = 0; i < len; i++)
1342 {
1343 mpres_mul_z_to_z (R[i], Vk, F[i], modulus);
1344 mpres_mul (Vt, Vk, V1, modulus);
1345 mpres_sub (Vt, Vt, Vk_1, modulus);
1346 mpres_set (Vk_1, Vk, modulus); /* Could be a swap */
1347 mpres_set (Vk, Vt, modulus); /* Could be a swap */
1348 }
1349
1350 mpres_clear (Vt, modulus);
1351 }
1352
1353
1354 /* For a given reciprocal polynomial
1355 F(x) = f_0 + sum_{i=1}^{deg} f_i V_i(x+1/x),
1356 compute F(\gamma x)F(\gamma^{-1} x), with Q = \gamma + 1 / \gamma
1357
1358 If NTT is used, needs 4 * deg + 3 entries in tmp.
1359 If no NTT is used, needs 4 * deg + 2 + (memory use of list_sqr_reciprocal)
1360 */
1361
1362 static void
list_scale_V(listz_t R,const listz_t F,const mpres_t Q,const unsigned long deg,mpmod_t modulus,listz_t tmp,const unsigned long tmplen,mpzspv_t dct,const mpzspm_t ntt_context)1363 list_scale_V (listz_t R, const listz_t F, const mpres_t Q,
1364 const unsigned long deg, mpmod_t modulus, listz_t tmp,
1365 const unsigned long tmplen,
1366 mpzspv_t dct, const mpzspm_t ntt_context)
1367 {
1368 mpres_t Vt;
1369 unsigned long i;
1370 const listz_t G = tmp, H = tmp + 2 * deg + 1, newtmp = tmp + 4 * deg + 2;
1371 const unsigned long newtmplen = tmplen - 4 * deg - 2;
1372 #ifdef WANT_ASSERT
1373 mpz_t leading;
1374 #endif
1375
1376 if (deg == 0)
1377 {
1378 ASSERT(tmplen >= 1);
1379 mpz_mul (tmp[0], F[0], F[0]);
1380 mpz_mod (R[0], tmp[0], modulus->orig_modulus);
1381 return;
1382 }
1383
1384 /* Make sure newtmplen does not underflow */
1385 ASSERT_ALWAYS (tmplen >= 4 * deg + 2);
1386 #ifdef WANT_ASSERT
1387 mpz_init (leading);
1388 mpz_mul (leading, F[deg], F[deg]);
1389 mpz_mod (leading, leading, modulus->orig_modulus);
1390 #endif
1391
1392 /* Generate V_1(Q)/2 ... V_{deg}(Q)/2, multiply by f_i to form coefficients
1393 of G(x). Square the symmetric G(x) polynomial. */
1394
1395 outputf (OUTPUT_TRACE, "list_scale_V: Q=%Zd, deg = %lu\n", Q, deg);
1396 list_output_poly (F, deg + 1, 0, 1, "/* list_scale_V */ F(x) = ", "\n",
1397 OUTPUT_TRACE);
1398
1399 /* Compute G[i] = V_i(Q)/2 * F[i] for i = 0, ..., deg.
1400 For i=0, V_0(Q) = 2, so G[0] = F[0],
1401 which leaves deg entries to process */
1402
1403 mpz_set (G[0], F[0]);
1404
1405 #if defined(_OPENMP)
1406 #pragma omp parallel if (deg > 1000)
1407 #endif
1408 {
1409 const int nr_chunks = omp_get_num_threads();
1410 const int thread_nr = omp_get_thread_num();
1411 mpmod_t modulus_local;
1412 unsigned long l, start_i;
1413 mpres_t Vi, Vi_1;
1414
1415 l = (deg - 1) / nr_chunks + 1; /* l = ceil (deg / nr_chunks) */
1416 start_i = thread_nr * l + 1;
1417 if (start_i <= deg + 1)
1418 l = MIN(l, deg + 1 - start_i);
1419 else
1420 l = 0;
1421
1422 mpmod_init_set (modulus_local, modulus);
1423 mpres_init (Vi_1, modulus_local);
1424 mpres_init (Vi, modulus_local);
1425
1426 V (Vi, Q, start_i, modulus_local);
1427 mpres_div_2exp (Vi, Vi, 1, modulus_local);
1428 V (Vi_1, Q, start_i - 1UL, modulus_local);
1429 mpres_div_2exp (Vi_1, Vi_1, 1, modulus_local);
1430 scale_by_chebyshev (G + start_i, F + start_i, l, modulus_local,
1431 Q, Vi_1, Vi);
1432
1433 mpres_clear (Vi_1, modulus_local);
1434 mpres_clear (Vi, modulus_local);
1435 mpmod_clear (modulus_local);
1436 }
1437
1438
1439 list_output_poly (G, deg + 1, 0, 1, "/* list_scale_V */ G(x) = ", "\n",
1440 OUTPUT_TRACE);
1441
1442 /* Now square the G polynomial in G[0 .. deg], put result in
1443 G[0 .. 2*deg] */
1444
1445 /* Bugfix: ks_multiply() does not like negative coefficients. FIXME */
1446
1447 for (i = 0; i <= deg; i++)
1448 if (mpz_sgn (G[i]) < 0)
1449 {
1450 mpz_add (G[i], G[i], modulus->orig_modulus);
1451 /* FIXME: make sure the absolute size does not "run away" */
1452 if (mpz_sgn (G[i]) < 0)
1453 {
1454 outputf (OUTPUT_ERROR, "list_scale_V: G[%lu] still negative\n", i);
1455 mpz_mod (G[i], G[i], modulus->orig_modulus);
1456 }
1457 }
1458
1459 if (dct != NULL && ntt_context != NULL)
1460 ntt_sqr_reciprocal (G, G, dct, deg + 1, ntt_context);
1461 else
1462 list_sqr_reciprocal (G, G, deg + 1, modulus->orig_modulus,
1463 newtmp, newtmplen);
1464
1465 list_output_poly (G, 2 * deg + 1, 0, 1, "/* list_scale_V */ G(x)^2 == ",
1466 "\n", OUTPUT_TRACE);
1467
1468 /* Compute H[i-1] = U_i(Q)/2 * F[i] for i = 1, ..., deg */
1469
1470 #if defined(_OPENMP)
1471 #pragma omp parallel if (deg > 1000)
1472 #endif
1473 {
1474 const int nr_chunks = omp_get_num_threads();
1475 const int thread_nr = omp_get_thread_num();
1476 mpmod_t modulus_local;
1477 unsigned long l, start_i;
1478 mpres_t Ui, Ui_1;
1479
1480 l = (deg - 1) / nr_chunks + 1; /* l = ceil(deg / nr_chunks) */
1481 start_i = thread_nr * l + 1UL;
1482 if (start_i <= deg + 1)
1483 l = MIN(l, deg + 1 - start_i);
1484 else
1485 l = 0;
1486
1487 mpmod_init_set (modulus_local, modulus);
1488 mpres_init (Ui_1, modulus_local);
1489 mpres_init (Ui, modulus_local);
1490
1491 U (Ui_1, Ui, Q, start_i - 1, modulus_local);
1492 mpres_div_2exp (Ui, Ui, 1, modulus_local);
1493 mpres_div_2exp (Ui_1, Ui_1, 1, modulus_local);
1494
1495 scale_by_chebyshev (H - 1 + start_i, F + start_i, l, modulus_local,
1496 Q, Ui_1, Ui);
1497
1498 mpres_clear (Ui_1, modulus_local);
1499 mpres_clear (Ui, modulus_local);
1500 mpmod_clear (modulus_local);
1501 }
1502
1503
1504 /* Convert H to standard basis */
1505 /* We can do it in-place with H - 1 = H_U. */
1506
1507 for (i = deg; i >= 3; i--)
1508 {
1509 mpz_add (H[i - 3], H[i - 3], H[i - 1]);
1510 if (mpz_cmp (H[i - 3], modulus->orig_modulus) >= 0)
1511 mpz_sub (H[i - 3], H[i - 3], modulus->orig_modulus);
1512 }
1513
1514 /* U_2(X+1/X) = (X^2 - 1/X^2)/(X-1/X) = X+1/X = V_1(X+1/X),
1515 so no addition occures here */
1516 /* if (deg >= 2)
1517 mpz_set (H[1], H[1]); Again, a no-op. */
1518
1519 /* U_1(X+1/X) = 1, so this goes to coefficient of index 0 in std. basis */
1520 /* mpz_set (H[0], H[0]); Another no-op. */
1521
1522 /* Now H[0 ... deg-1] contains the deg coefficients in standard basis
1523 of symmetric H(X) of degree 2*deg-2. */
1524
1525 list_output_poly (H, deg, 0, 1, "/* list_scale_V */ H(x) = ", "\n",
1526 OUTPUT_TRACE);
1527
1528 /* Square the symmetric H polynomial of degree 2*deg-2 (i.e. with deg
1529 coefficents in standard basis in H[0 ... deg-1]) */
1530
1531 /* Bugfix: ks_multiply() does not like negative coefficients. */
1532
1533 for (i = 0; i <= deg; i++)
1534 if (mpz_sgn (H[i]) < 0)
1535 {
1536 mpz_add (H[i], H[i], modulus->orig_modulus);
1537 if (mpz_sgn (H[i]) < 0)
1538 {
1539 outputf (OUTPUT_ERROR, "list_scale_V: H[%lu] still negative\n", i);
1540 mpz_mod (H[i], H[i], modulus->orig_modulus);
1541 }
1542 }
1543
1544 if (dct != NULL && ntt_context != NULL)
1545 ntt_sqr_reciprocal (H, H, dct, deg, ntt_context);
1546 else
1547 list_sqr_reciprocal (H, H, deg, modulus->orig_modulus,
1548 newtmp, newtmplen);
1549
1550 /* Now there are the 2*deg-1 coefficients in standard basis of a
1551 symmetric polynomial of degree 4*deg - 4 in H[0 ... 2*deg-2] */
1552
1553 list_output_poly (H, 2*deg - 1, 0, 1, "/* list_scale_V */ H(x)^2 == ", "\n",
1554 OUTPUT_TRACE);
1555
1556 /* Multiply by Q^2-4 */
1557 mpres_init (Vt, modulus);
1558 mpres_sqr (Vt, Q, modulus);
1559 mpres_sub_ui (Vt, Vt, 4, modulus);
1560
1561 #if defined(_OPENMP)
1562 #pragma omp parallel if (deg > 1000)
1563 {
1564 mpmod_t modulus_local;
1565 long i; /* OpenMP insists on signed loop iteration var :( */
1566
1567 mpmod_init_set (modulus_local, modulus);
1568
1569 #pragma omp for
1570 for (i = 0; (unsigned long) i <= 2 * deg - 2; i++)
1571 mpres_mul_z_to_z (H[i], Vt, H[i], modulus_local);
1572 mpmod_clear (modulus_local);
1573 }
1574 #else
1575 for (i = 0; (unsigned long) i <= 2 * deg - 2; i++)
1576 mpres_mul_z_to_z (H[i], Vt, H[i], modulus);
1577 #endif
1578
1579 list_output_poly (H, 2 * deg - 1, 0, 1, "/* list_scale_V */ "
1580 "H(x)^2*(Q^2-4) == ", "\n", OUTPUT_TRACE);
1581
1582
1583 /* Multiply by (X - 1/X)^2 = X^2 - 2 + 1/X^2 and subtract from G */
1584 ASSERT (newtmplen > 0UL);
1585 if (deg == 1)
1586 {
1587 /* H(X) has degree 2*deg-2 = 0, so H(X) = h_0
1588 H(X) * (X - 1/X)^2 = -2 h_0 + h_0 V_2(Y) */
1589 mpz_mul_2exp (newtmp[0], H[0], 1UL);
1590 mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*H[0] */
1591 mpz_sub (G[2], G[2], H[0]);
1592 }
1593 else if (deg == 2)
1594 {
1595 /* H(X) has degree 2*deg-2 = 2, , so
1596 H(X) = h_0 + h_1 (X+1/X) + h_2 (X^2+1/X^2)
1597
1598 H(X) * (X - 1/X)^2 =
1599 -2*(h_0 - h_2) - h_1 * V_1(Y) + (h_0 - 2*h_2) * V_2(Y) +
1600 h_1 * V_3(Y) + h_2 * V_4(Y)
1601 */
1602 mpz_sub (newtmp[0], H[0], H[2]); /* h_0 - h_2 */
1603 mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* 2*(h_0 - h_2) */
1604 mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*(h_0 - h_2) */
1605
1606 mpz_add (G[1], G[1], H[1]); /* G[1] -= -h_1 */
1607 mpz_sub (newtmp[0], newtmp[0], H[0]); /* h_0 - 2*h_2 */
1608 mpz_sub (G[2], G[2], newtmp[0]); /* G[2] -= h_0 - 2*h_2 */
1609 mpz_sub (G[3], G[3], H[1]); /* G[3] -= h_1 */
1610 mpz_sub (G[4], G[4], H[2]); /* G[3] -= h_2 */
1611 }
1612 else
1613 {
1614 /* Let H(X) = h_0 + \sum_{i=1}^{n} h_i V_i(Y), Y = X+1/X. Then
1615 (x - 1/x)^2 H(X) =
1616 -2(h_0 - h_2) +
1617 (- h_1 + h_3) V_1(Y) +
1618 \sum_{i=2}^{n-2} (h_{i-2} - 2h_i + h_{i+2}) V_i(Y) +
1619 (h_{n-3} - 2h_{n-1}) V_{n-1}(Y) +
1620 (h_{n-2} - 2h_n) V_n(Y) +
1621 h_{n-1} V_{n+1}(Y) +
1622 h_n V_{n+2}(Y)
1623
1624 In our case, n = 2 * deg - 2
1625 */
1626 mpz_sub (newtmp[0], H[0], H[2]);
1627 mpz_mul_2exp (newtmp[0], newtmp[0], 1UL); /* t[0] = 2*(h_0 - h_2) */
1628 mpz_add (G[0], G[0], newtmp[0]); /* G[0] -= -2*(h_0 - h_2) */
1629
1630 mpz_add (G[1], G[1], H[1]);
1631 mpz_sub (G[1], G[1], H[3]); /* G[1] -= -h_1 + h_3 */
1632
1633 for (i = 2; i <= 2 * deg - 4; i++)
1634 {
1635 mpz_mul_2exp (newtmp[0], H[i], 1);
1636 mpz_sub (newtmp[0], newtmp[0], H[i - 2]);
1637 mpz_sub (newtmp[0], newtmp[0], H[i + 2]); /* 2h_i-h_{i-2}-h_{i+2} */
1638 mpz_add (G[i], G[i], newtmp[0]); /* G[i] -= -2h_i+h_{i-2}+h_{i+2} */
1639 }
1640
1641 for ( ; i <= 2 * deg - 2; i++)
1642 {
1643 mpz_mul_2exp (newtmp[0], H[i], 1UL);
1644 mpz_sub (newtmp[0], H[i - 2], newtmp[0]); /* h_{n-3} - 2h_{n-1} */
1645 mpz_sub (G[i], G[i], newtmp[0]);
1646 }
1647
1648 mpz_sub (G[i], G[i], H[i - 2]);
1649 mpz_sub (G[i + 1], G[i + 1], H[i - 1]);
1650 }
1651
1652 for (i = 0; i <= 2 * deg; i++)
1653 mpz_mod (R[i], G[i], modulus->orig_modulus);
1654
1655 if (test_verbose (OUTPUT_TRACE))
1656 for (i = 0; i <= 2 * deg; i++)
1657 outputf (OUTPUT_TRACE, "list_scale_V: R[%lu] = %Zd\n", i, R[i]);
1658
1659 #ifdef WANT_ASSERT
1660 mpz_mod (R[2 * deg], R[2 * deg], modulus->orig_modulus);
1661 ASSERT (mpz_cmp (leading, R[2 * deg]) == 0);
1662 mpz_clear (leading);
1663 #endif
1664
1665 mpres_clear (Vt, modulus);
1666 }
1667
1668
1669 #ifdef WANT_ASSERT
1670 /* Check if l is an (anti-)symmetric, possibly monic, polynomial.
1671 Returns -1 if it is (anti-)symmetric, or the smallest index i where
1672 l[i] != l[len - 1 + monic - i])
1673 If anti == 1, the list is checked for symmetry, if it is -1, for
1674 antisymmetry.
1675 This function is used only if assertions are enabled.
1676 */
1677
1678 static long int ATTRIBUTE_UNUSED
list_is_symmetric(listz_t l,unsigned long len,int monic,int anti,mpz_t modulus,mpz_t tmp)1679 list_is_symmetric (listz_t l, unsigned long len, int monic, int anti,
1680 mpz_t modulus, mpz_t tmp)
1681 {
1682 unsigned long i;
1683
1684 ASSERT (monic == 0 || monic == 1);
1685 ASSERT (anti == 1 || anti == -1);
1686
1687 if (monic && anti == 1 && mpz_cmp_ui (l[0], 1) != 0)
1688 return 0L;
1689
1690 if (monic && anti == -1)
1691 {
1692 mpz_sub_ui (tmp, modulus, 1);
1693 if (mpz_cmp (tmp, l[0]) != 0)
1694 return 0L;
1695 }
1696
1697 for (i = monic; i < len / 2; i++)
1698 {
1699 if (anti == -1)
1700 {
1701 /* Negate (mod modulus) */
1702 if (mpz_sgn (l[i]) == 0)
1703 {
1704 if (mpz_sgn (l[len - 1 + monic - i]) != 0)
1705 return (long) i;
1706 }
1707 else
1708 {
1709 mpz_sub (tmp, modulus, l[i]);
1710 if (mpz_cmp (tmp, l[len - 1 + monic - i]) != 0)
1711 return (long) i;
1712 }
1713 }
1714 else if (mpz_cmp (l[i], l[len - 1 + monic - i]) != 0)
1715 return (long) i;
1716 }
1717
1718 return -1L;
1719 }
1720 #endif
1721
1722 #if 0 && defined(WANT_ASSERT)
1723 /* Evaluate a polynomial of degree n-1 with all coefficients given in F[],
1724 or of degree n with an implicit leading 1 monomial not stored in F[],
1725 at x modulo modulus. Result goes in r. tmp needs 2 entries. */
1726
1727 static void
1728 list_eval_poly (mpz_t r, const listz_t F, const mpz_t x,
1729 const unsigned long n, const int monic, const mpz_t modulus,
1730 listz_t tmp)
1731 {
1732 unsigned long i;
1733
1734 mpz_set_ui (tmp[0], 1UL);
1735 mpz_set_ui (r, 0UL);
1736
1737 for (i = 0UL; i < n; i++)
1738 {
1739 /* tmp[0] = x^i */
1740 mpz_mul (tmp[1], F[i], tmp[0]);
1741 mpz_mod (tmp[1], tmp[1], modulus);
1742 mpz_add (r, r, tmp[1]);
1743
1744 mpz_mul (tmp[1], tmp[0], x);
1745 mpz_mod (tmp[0], tmp[1], modulus);
1746 }
1747
1748 if (monic)
1749 mpz_add (r, r, tmp[0]);
1750
1751 mpz_mod (r, r, modulus);
1752 }
1753 #endif
1754
1755 /* return the memory needed in the F[] array for poly_from_sets_V */
1756 static unsigned long
mem_poly_from_sets_V(sets_long_t * sets)1757 mem_poly_from_sets_V (sets_long_t *sets)
1758 {
1759 unsigned long c, deg, i, nr;
1760 set_long_t *set = sets->sets;
1761 unsigned long mem, maxmem = 0;
1762
1763 deg = 1UL;
1764 for (nr = sets->nr - 1UL; nr > 0UL; nr--)
1765 {
1766 set = sets_nextset (sets->sets); /* Skip first set */
1767 for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
1768 set = sets_nextset (set);
1769 c = set->card;
1770 if (c == 2UL)
1771 deg *= 2UL;
1772 else
1773 {
1774 i = (c - 1UL) / 2UL - 1; /* maximal value of i */
1775 /* list_scale_V needs 2*deg+1 entries starting at
1776 F + (2UL * i + 1UL) * (deg + 1UL) */
1777 mem = (2UL * i + 1UL) * (deg + 1UL) + (2 * deg + 1UL);
1778 if (mem > maxmem)
1779 maxmem = mem;
1780 deg *= c;
1781 }
1782 }
1783
1784 return maxmem;
1785 }
1786
1787 /* Build a polynomial with roots r^2i, i in the sumset of the sets in "sets".
1788 The parameter Q = r + 1/r. This code uses the fact that the polynomials
1789 are symmetric. Requires that the first set in "sets" has cardinality 2,
1790 all sets must be symmetric around 0. The resulting polynomial of degree
1791 2*d is F(x) = f_0 + \sum_{1 <= i <= d} f_i (x^i + 1/x^i). The coefficient
1792 f_i is stored in F[i], which therefore needs d+1 elements. */
1793
1794 static unsigned long
poly_from_sets_V(listz_t F,const mpres_t Q,sets_long_t * sets,listz_t tmp,const unsigned long tmplen,mpmod_t modulus,mpzspv_t dct,const mpzspm_t ntt_context)1795 poly_from_sets_V (listz_t F, const mpres_t Q, sets_long_t *sets,
1796 listz_t tmp, const unsigned long tmplen, mpmod_t modulus,
1797 mpzspv_t dct, const mpzspm_t ntt_context)
1798 {
1799 unsigned long c, deg, i, nr;
1800 set_long_t *set = sets->sets;
1801 mpres_t Qt;
1802
1803 ASSERT_ALWAYS (sets->nr > 0UL);
1804 ASSERT_ALWAYS (set->card == 2UL); /* Check that the cardinality of
1805 first set is 2 */
1806 /* Check that first set is symmetric around 0 (we write card-1
1807 instead of 1 to avoid a compiler warning with clang 2.9) */
1808 ASSERT_ALWAYS (set->elem[0] == -set->elem[set->card - 1]);
1809
1810 if (test_verbose (OUTPUT_TRACE))
1811 {
1812 mpz_t t;
1813 mpz_init (t);
1814 mpres_get_z (t, Q, modulus);
1815 outputf (OUTPUT_TRACE, "poly_from_sets_V (F, Q = %Zd, sets)\n", t);
1816 mpz_clear (t);
1817 }
1818
1819 mpres_init (Qt, modulus);
1820
1821 outputf (OUTPUT_DEVVERBOSE, " (processing set of size 2");
1822
1823 V (Qt, Q, set->elem[0], modulus); /* First set in sets is {-k, k} */
1824 V (Qt, Qt, 2UL, modulus); /* Qt = V_2k(Q) */
1825
1826 mpres_neg (Qt, Qt, modulus);
1827 mpres_get_z (F[0], Qt, modulus);
1828 mpz_set_ui (F[1], 1UL);
1829 deg = 1UL;
1830 /* Here, F(x) = (x - r^{2k_1})(x - r^{-2k_1}) / x =
1831 (x^2 - x (r^{2k_1} + r^{-2k_1}) + 1) / x =
1832 (x + 1/x) - V_{2k_1}(r + 1/r) */
1833
1834 for (nr = sets->nr - 1UL; nr > 0UL; nr--)
1835 {
1836 /* Assuming the sets are sorted in order of ascending cardinality,
1837 we process them back-to-front so the sets of cardinality 2 are
1838 processed last, but skipping the first set which we processed
1839 already. */
1840
1841 set = sets_nextset (sets->sets); /* Skip first set */
1842 for (i = 1UL; i < nr; i++) /* Skip over remaining sets but one */
1843 set = sets_nextset (set);
1844
1845 /* Process this set. We assume it is either of cardinality 2, or of
1846 odd cardinality */
1847 c = set->card;
1848 outputf (OUTPUT_DEVVERBOSE, " %lu", c);
1849
1850 if (c == 2UL)
1851 {
1852 /* Check it's symmetric (we write c-1 instead of 2 to avoid a
1853 compiler warning with clang 2.9) */
1854 ASSERT_ALWAYS (set->elem[0] == -set->elem[c - 1]);
1855 V (Qt, Q, set->elem[0], modulus);
1856 V (Qt, Qt, 2UL, modulus);
1857 list_scale_V (F, F, Qt, deg, modulus, tmp, tmplen, dct,
1858 ntt_context);
1859 deg *= 2UL;
1860 ASSERT_ALWAYS (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
1861 }
1862 else
1863 {
1864 ASSERT_ALWAYS (c % 2UL == 1UL);
1865 ASSERT_ALWAYS (set->elem[(c - 1UL) / 2UL] == 0UL);
1866 /* Generate the F(Q^{2k_i} * X)*F(Q^{-2k_i} * X) polynomials.
1867 Each is symmetric of degree 2*deg, so each has deg+1 coeffients
1868 in standard basis. */
1869 for (i = 0UL; i < (c - 1UL) / 2UL; i++)
1870 {
1871 /* Check it's symmetric */
1872 ASSERT_ALWAYS (set->elem[i] == -set->elem[c - 1L - i]);
1873 V (Qt, Q, set->elem[i], modulus);
1874 V (Qt, Qt, 2UL, modulus);
1875 ASSERT (mpz_cmp_ui (F[deg], 1UL) == 0); /* Check it's monic */
1876 list_scale_V (F + (2UL * i + 1UL) * (deg + 1UL), F, Qt, deg,
1877 modulus, tmp, tmplen, dct, ntt_context);
1878 ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL * deg],
1879 1UL) == 0); /* Check it's monic */
1880 }
1881 /* Multiply the polynomials */
1882 for (i = 0UL; i < (c - 1UL) / 2UL; i++)
1883 {
1884 /* So far, we have the product
1885 F(X) * F(Q^{2k_j} * X) * F(Q^{-2k_j} * X), 1 <= j <= i,
1886 at F. This product has degree 2 * deg + i * 4 * deg, that is
1887 (2 * i + 1) * 2 * deg, which means (2 * i + 1) * deg + 1
1888 coefficients in F[0 ... (i * 2 + 1) * deg]. */
1889 ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * deg], 1UL) == 0);
1890 ASSERT (mpz_cmp_ui (F[(2UL * i + 1UL) * (deg + 1UL) + 2UL*deg],
1891 1UL) == 0);
1892 list_output_poly (F, (2UL * i + 1UL) * deg + 1, 0, 1,
1893 "poly_from_sets_V: Multiplying ", "\n",
1894 OUTPUT_TRACE);
1895 list_output_poly (F + (2UL * i + 1UL) * (deg + 1UL),
1896 2UL * deg + 1UL, 0, 1, " and ", "\n",
1897 OUTPUT_TRACE);
1898 list_mul_reciprocal (F,
1899 F, (2UL * i + 1UL) * deg + 1UL,
1900 F + (2UL * i + 1UL) * (deg + 1UL),
1901 2UL * deg + 1UL, modulus->orig_modulus,
1902 tmp, tmplen);
1903 list_mod (F, F, (2UL * i + 3UL) * deg + 1UL,
1904 modulus->orig_modulus);
1905 list_output_poly (F, (2UL * i + 3UL) * deg + 1UL, 0, 1,
1906 " = ", "\n", OUTPUT_TRACE);
1907 ASSERT (mpz_cmp_ui (F[(2UL * i + 3UL) * deg], 1UL) == 0);
1908 }
1909 deg *= c;
1910 }
1911 }
1912
1913 mpres_clear (Qt, modulus);
1914 outputf (OUTPUT_DEVVERBOSE, ")");
1915
1916 return deg;
1917 }
1918
1919 static int
build_F_ntt(listz_t F,const mpres_t P_1,sets_long_t * S_1,const faststage2_param_t * params,mpmod_t modulus)1920 build_F_ntt (listz_t F, const mpres_t P_1, sets_long_t *S_1,
1921 const faststage2_param_t *params, mpmod_t modulus)
1922 {
1923 mpzspm_t F_ntt_context;
1924 mpzspv_t F_ntt;
1925 unsigned long tmplen;
1926 listz_t tmp;
1927 long timestart, realstart;
1928 unsigned long i;
1929
1930 timestart = cputime ();
1931 realstart = realtime ();
1932
1933 /* Precompute the small primes, primitive roots and inverses etc. for
1934 the NTT. The code to multiply wants a 3*k-th root of unity, where
1935 k is the smallest power of 2 with k > s_1/2 */
1936
1937 F_ntt_context = mpzspm_init (3UL << ceil_log2 (params->s_1 / 2 + 1),
1938 modulus->orig_modulus);
1939 if (F_ntt_context == NULL)
1940 {
1941 outputf (OUTPUT_ERROR, "Could not initialise F_ntt_context, "
1942 "presumably out of memory\n");
1943 return ECM_ERROR;
1944 }
1945
1946 print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for building F = ",
1947 F_ntt_context);
1948
1949 outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
1950
1951 tmplen = params->s_1 + 100;
1952 tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
1953 F_ntt = mpzspv_init (1UL << ceil_log2 (params->s_1 / 2 + 1), F_ntt_context);
1954
1955 i = poly_from_sets_V (F, P_1, S_1, tmp, tmplen, modulus, F_ntt,
1956 F_ntt_context);
1957 ASSERT_ALWAYS(2 * i == params->s_1);
1958 ASSERT_ALWAYS(mpz_cmp_ui (F[i], 1UL) == 0);
1959
1960 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
1961 if (test_verbose (OUTPUT_TRACE))
1962 {
1963 for (i = 0; i < params->s_1 / 2 + 1; i++)
1964 outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
1965 outputf (OUTPUT_TRACE, "f(x) = f_0");
1966 for (i = 1; i < params->s_1 / 2 + 1; i++)
1967 outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
1968 outputf (OUTPUT_TRACE, "/* PARI */ \n");
1969 }
1970
1971 clear_list (tmp, tmplen);
1972 tmp = NULL;
1973 mpzspv_clear (F_ntt, F_ntt_context);
1974 F_ntt = NULL;
1975 mpzspm_clear (F_ntt_context);
1976 F_ntt_context = NULL;
1977
1978 return 0;
1979 }
1980
1981 /* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l.
1982 x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P.
1983 Stores the result in g[0 ... l] and/or in g_ntt[offset ... offset + l] */
1984
1985 static void
pm1_sequence_g(listz_t g_mpz,mpzspv_t g_ntt,const mpres_t b_1,const unsigned long P,const long M_param,const unsigned long l_param,const mpz_t m_1,const long k_2,mpmod_t modulus_param,const mpzspm_t ntt_context)1986 pm1_sequence_g (listz_t g_mpz, mpzspv_t g_ntt, const mpres_t b_1,
1987 const unsigned long P, const long M_param,
1988 const unsigned long l_param, const mpz_t m_1, const long k_2,
1989 mpmod_t modulus_param, const mpzspm_t ntt_context)
1990 {
1991 mpres_t r[3], x_0, x_Mi;
1992 mpz_t t;
1993 unsigned long i;
1994 long timestart, realstart;
1995 long M = M_param;
1996 unsigned long l = l_param, offset = 0UL;
1997 mpmod_t modulus;
1998 int want_output = 1;
1999
2000 outputf (OUTPUT_VERBOSE, "Computing g_i");
2001 outputf (OUTPUT_DEVVERBOSE, "\npm1_sequence_g: P = %lu, M_param = %lu, "
2002 "l_param = %lu, m_1 = %Zd, k_2 = %lu\n",
2003 P, M_param, l_param, m_1, k_2);
2004 timestart = cputime ();
2005 realstart = realtime ();
2006
2007 #ifdef _OPENMP
2008 #pragma omp parallel if (l > 100) private(r, x_0, x_Mi, t, i, M, l, offset, modulus, want_output)
2009 {
2010 /* When multi-threading, we adjust the parameters for each thread */
2011
2012 const int nr_chunks = omp_get_num_threads();
2013 const int thread_nr = omp_get_thread_num();
2014
2015 l = (l_param - 1) / nr_chunks + 1; /* = ceil(l_param / nr_chunks) */
2016 offset = thread_nr * l;
2017 if (offset <= l_param)
2018 l = MIN(l, l_param - offset);
2019 else
2020 l = 0;
2021 outputf (OUTPUT_DEVVERBOSE,
2022 "pm1_sequence_g: thread %d has l = %lu, offset = %lu.\n",
2023 thread_nr, l, offset);
2024 M = M_param - (long) offset;
2025
2026 /* Let only the master thread print stuff */
2027 want_output = (thread_nr == 0);
2028
2029 if (want_output)
2030 outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
2031 #endif
2032
2033 /* Make a private copy of the mpmod_t struct */
2034 mpmod_init_set (modulus, modulus_param);
2035
2036 mpz_init (t);
2037 mpres_init (r[0], modulus);
2038 mpres_init (r[1], modulus);
2039 mpres_init (r[2], modulus);
2040 mpres_init (x_0, modulus);
2041 mpres_init (x_Mi, modulus);
2042
2043 if (want_output)
2044 {
2045 if (test_verbose (OUTPUT_TRACE))
2046 {
2047 mpres_get_z (t, b_1, modulus);
2048 outputf (OUTPUT_TRACE, "\n/* pm1_sequence_g */ N = %Zd; "
2049 "b_1 = Mod(%Zd, N); /* PARI */\n", modulus->orig_modulus, t);
2050 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ P = %lu; M = %ld; "
2051 "m_1 = %Zd; /* PARI */\n", P, M, m_1);
2052 outputf (OUTPUT_TRACE,
2053 "/* pm1_sequence_g */ r = b_1^P; /* PARI */\n");
2054 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 = "
2055 "b_1^(2*%ld + (2*m_1 + 1)*P); /* PARI */\n", k_2);
2056 }
2057 }
2058
2059 /* We use (M-(i+1))^2 = (M-i)^2 + 2*(-M+i) + 1 */
2060 mpz_set_ui (t, P);
2061 mpres_pow (r[0], b_1, t, modulus); /* r[0] = b_1^P = r */
2062 if (test_verbose (OUTPUT_TRACE))
2063 {
2064 mpres_get_z (t, r[0], modulus);
2065 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ r == %Zd /* PARI C */\n", t);
2066 }
2067
2068 /* FIXME: This is a huge mess, clean up some time */
2069
2070 mpz_set_si (t, M);
2071 mpz_neg (t, t);
2072 mpz_mul_2exp (t, t, 1UL);
2073 mpz_add_ui (t, t, 1UL);
2074 /* Warning: t might be negative here. */
2075 mpres_pow (r[1], r[0], t, modulus); /* r[1] = r^{2(-M+i)+1}, i = 0 */
2076 mpz_set_si (t, M);
2077 mpz_mul (t, t, t); /* t = M^2 */
2078 mpres_pow (r[2], r[0], t, modulus); /* r[2] = r^{(M-i)^2}, i = 0 */
2079 mpres_sqr (r[0], r[0], modulus); /* r[0] = r^2 */
2080
2081 mpz_mul_2exp (t, m_1, 1UL);
2082 mpz_add_ui (t, t, 1UL);
2083 mpz_mul_ui (t, t, P);
2084 mpz_add_si (t, t, k_2);
2085 mpz_add_si (t, t, k_2);
2086 if (want_output)
2087 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ 2*%ld + (2*%Zd + 1)*P == "
2088 "%Zd /* PARI C */\n", k_2, m_1, t);
2089
2090 /* Warning: t might be negative here. */
2091 mpres_pow (x_0, b_1, t, modulus); /* x_0 = b_1^{2*k_2 + (2*m_1 + 1)*P} */
2092 if (want_output && test_verbose (OUTPUT_TRACE))
2093 {
2094 mpres_get_z (t, x_0, modulus);
2095 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ x_0 == %Zd /* PARI C */\n",
2096 t);
2097 }
2098
2099 mpz_set_si (t, M);
2100 mpres_pow (x_Mi, x_0, t, modulus); /* x_Mi = x_0^{M-i}, i = 0 */
2101
2102 mpres_invert (x_0, x_0, modulus); /* x_0 := x_0^{-1} now */
2103 mpres_mul (r[1], r[1], x_0, modulus); /* r[1] = x_0^{-1} * r^{-2M+1} */
2104
2105 mpres_mul (r[2], r[2], x_Mi, modulus); /* r[2] = x_0^M * r^{M^2} */
2106 mpres_get_z (t, r[2], modulus);
2107 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd; /* PARI */\n",
2108 offset, t);
2109 if (l > 0)
2110 {
2111 if (g_mpz != NULL)
2112 mpz_set (g_mpz[offset], t);
2113 if (g_ntt != NULL)
2114 mpzspv_from_mpzv (g_ntt, offset, &t, 1UL, ntt_context);
2115 }
2116
2117 /* So here we have for i = 0
2118 r[2] = x_0^(M-i) * r^{(M-i)^2}
2119 r[1] = x_0^{-1} * r^{2(-M+i)+1}
2120 r[0] = r^2
2121 t = r[2]
2122 */
2123
2124 for (i = 1; i < l; i++)
2125 {
2126 if (g_mpz != NULL)
2127 {
2128 mpres_mul_z_to_z (g_mpz[offset + i], r[1], g_mpz[offset + i - 1],
2129 modulus);
2130 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
2131 " /* PARI */\n", offset + i, g_mpz[offset + i]);
2132 }
2133 if (g_ntt != NULL)
2134 {
2135 mpres_mul_z_to_z (t, r[1], t, modulus);
2136 if (g_mpz == NULL) /* Only one should be non-NULL... */
2137 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu = %Zd;"
2138 " /* PARI */\n", offset + i, t);
2139 mpzspv_from_mpzv (g_ntt, offset + i, &t, 1UL, ntt_context);
2140 }
2141 mpres_mul (r[1], r[1], r[0], modulus);
2142 }
2143
2144 mpres_clear (r[0], modulus);
2145 mpres_clear (r[1], modulus);
2146 mpres_clear (r[2], modulus);
2147 mpres_clear (x_0, modulus);
2148 mpres_clear (x_Mi, modulus);
2149 mpz_clear (t);
2150 mpmod_clear (modulus); /* Clear our private copy of modulus */
2151
2152 #ifdef _OPENMP
2153 }
2154 #endif
2155
2156 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
2157
2158 if (test_verbose (OUTPUT_TRACE))
2159 {
2160 for (i = 0; i < l_param; i++)
2161 {
2162 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g_%lu == x_0^"
2163 "(M - %lu) * r^((M - %lu)^2) /* PARI C */\n", i, i, i);
2164 }
2165 outputf (OUTPUT_TRACE, "/* pm1_sequence_g */ g(x) = g_0");
2166 for (i = 1; i < l; i++)
2167 outputf (OUTPUT_TRACE, " + g_%lu * x^%lu", i, i);
2168 outputf (OUTPUT_TRACE, " /* PARI */\n");
2169 }
2170 }
2171
2172
2173 /* Compute h_j = r^(-j^2) * f_j for 0 <= j < d as described in section 9
2174 of the paper. h == f is ok. */
2175
2176 static void
pm1_sequence_h(listz_t h,mpzspv_t h_ntt,mpz_t * f,const mpres_t r,const unsigned long d,mpmod_t modulus_parm,const mpzspm_t ntt_context)2177 pm1_sequence_h (listz_t h, mpzspv_t h_ntt, mpz_t *f, const mpres_t r,
2178 const unsigned long d, mpmod_t modulus_parm,
2179 const mpzspm_t ntt_context)
2180 {
2181 mpres_t invr; /* r^{-1}. Can be shared between threads */
2182 long timestart, realstart;
2183
2184 mpres_init (invr, modulus_parm);
2185 mpres_invert (invr, r, modulus_parm); /* invr = r^{-1}. FIXME: test for
2186 failure, even if theoretically
2187 impossible */
2188
2189 if (test_verbose (OUTPUT_TRACE))
2190 {
2191 mpz_t t;
2192 mpz_init (t);
2193 mpres_get_z (t, r, modulus_parm);
2194 outputf (OUTPUT_TRACE, "\n/* pm1_sequence_h */ N = %Zd; "
2195 "r = Mod(%Zd, N); /* PARI */\n",
2196 modulus_parm->orig_modulus, t);
2197 mpz_clear (t);
2198 }
2199
2200 outputf (OUTPUT_VERBOSE, "Computing h");
2201 timestart = cputime ();
2202 realstart = realtime ();
2203
2204 #ifdef _OPENMP
2205 #pragma omp parallel if (d > 100)
2206 #endif
2207 {
2208 mpres_t fd[3]; /* finite differences table for r^{-i^2}*/
2209 mpz_t t; /* the h_j value as an mpz_t */
2210 unsigned long j;
2211 unsigned long offset = 0UL, len = d;
2212 mpmod_t modulus;
2213
2214 /* Adjust offset and length for this thread */
2215 #ifdef _OPENMP
2216 {
2217 const int nr_chunks = omp_get_num_threads();
2218 const int thread_nr = omp_get_thread_num();
2219 unsigned long chunklen;
2220
2221 if (thread_nr == 0)
2222 outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
2223
2224 chunklen = (len - 1UL) / (unsigned long) nr_chunks + 1UL;
2225 offset = chunklen * (unsigned long) thread_nr;
2226 if (offset <= len)
2227 len = MIN(chunklen, len - offset);
2228 else
2229 len = 0;
2230 }
2231 #endif
2232
2233 mpmod_init_set (modulus, modulus_parm);
2234 mpres_init (fd[0], modulus);
2235 mpres_init (fd[1], modulus);
2236 mpres_init (fd[2], modulus);
2237 mpz_init (t);
2238
2239 /* We have (n + 1)^2 = n^2 + 2n + 1. For the finite differences we'll
2240 need r^{-2}, r^{-(2n+1)}, r^{-n^2}. Init for n = 0. */
2241
2242 /* r^{-2} in fd[0] is constant and could be shared. Computing it
2243 separately in each thread has the advantage of putting it in
2244 local memory. May not make much difference overall */
2245
2246 mpres_sqr (fd[0], invr, modulus); /* fd[0] = r^{-2} */
2247 mpz_set_ui (t, offset);
2248 mpz_mul_2exp (t, t, 1UL);
2249 mpz_add_ui (t, t, 1UL); /* t = 2 * offset + 1 */
2250 mpres_pow (fd[1], invr, t, modulus); /* fd[1] = r^{-(2*offset+1)} */
2251 mpz_set_ui (t, offset);
2252 mpz_mul (t, t, t); /* t = offset^2 */
2253 mpres_pow (fd[2], invr, t, modulus); /* fd[2] = r^{-offset^2} */
2254
2255 /* Generate the sequence */
2256 for (j = offset; j < offset + len; j++)
2257 {
2258 mpres_mul_z_to_z (t, fd[2], f[j], modulus);
2259 outputf (OUTPUT_TRACE,
2260 "/* pm1_sequence_h */ h_%lu = %Zd; /* PARI */\n", j, t);
2261
2262 if (h != NULL)
2263 mpz_set (h[j], t);
2264 if (h_ntt != NULL)
2265 mpzspv_from_mpzv (h_ntt, j, &t, 1UL, ntt_context);
2266
2267 mpres_mul (fd[2], fd[2], fd[1], modulus); /* fd[2] = r^{-j^2} */
2268 mpres_mul (fd[1], fd[1], fd[0], modulus); /* fd[1] = r^{-2*j-1} */
2269 }
2270
2271 mpres_clear (fd[2], modulus);
2272 mpres_clear (fd[1], modulus);
2273 mpres_clear (fd[0], modulus);
2274 mpz_clear (t);
2275 mpmod_clear (modulus);
2276 }
2277
2278 mpres_clear (invr, modulus_parm);
2279
2280 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
2281
2282 if (test_verbose (OUTPUT_TRACE))
2283 {
2284 unsigned long j;
2285 for (j = 0; j < d; j++)
2286 outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h_%lu == "
2287 "f_%lu * r^(-%lu^2) /* PARI C */\n", j, j, j);
2288 outputf (OUTPUT_TRACE, "/* pm1_sequence_h */ h(x) = h_0");
2289 for (j = 1; j < d; j++)
2290 outputf (OUTPUT_TRACE, " + h_%lu * (x^%lu + x^(-%lu))", j, j, j);
2291 outputf (OUTPUT_TRACE, " /* PARI */\n");
2292 }
2293 }
2294
2295
2296 static int
make_S_1_S_2(sets_long_t ** S_1,set_long_t ** S_2,const faststage2_param_t * params)2297 make_S_1_S_2 (sets_long_t **S_1, set_long_t **S_2,
2298 const faststage2_param_t *params)
2299 {
2300 unsigned long i;
2301 sets_long_t *facS_2;
2302 size_t facS_2_size;
2303
2304 *S_1 = sets_get_factored_sorted (params->P);
2305 if (*S_1 == NULL)
2306 return ECM_ERROR;
2307
2308 {
2309 mpz_t t1, t2;
2310
2311 mpz_init (t1);
2312 mpz_init (t2);
2313 sets_sumset_minmax (t1, *S_1, 1);
2314 sets_max (t2, params->P);
2315 ASSERT_ALWAYS (mpz_cmp (t1, t2) == 0);
2316 mpz_clear (t1);
2317 mpz_clear (t2);
2318 }
2319
2320 *S_2 = malloc (set_sizeof(params->s_2));
2321 if (*S_2 == NULL)
2322 {
2323 free (*S_1);
2324 return ECM_ERROR;
2325 }
2326
2327 /* Extract sets for S_2 and compute the set of sums */
2328
2329 sets_extract (NULL, &facS_2_size, *S_1, params->s_2);
2330 facS_2 = malloc (facS_2_size);
2331 if (facS_2 == NULL)
2332 {
2333 free (*S_1);
2334 free (*S_2);
2335 return ECM_ERROR;
2336 }
2337 sets_extract (facS_2, NULL, *S_1, params->s_2);
2338 sets_sumset (*S_2, facS_2);
2339 ASSERT_ALWAYS ((*S_2)->card == params->s_2);
2340 free (facS_2);
2341 quicksort_long ((*S_2)->elem, (*S_2)->card);
2342
2343 /* Print the sets in devverbose mode */
2344 if (test_verbose (OUTPUT_DEVVERBOSE))
2345 {
2346 outputf (OUTPUT_DEVVERBOSE, "S_1 = ");
2347 sets_print (OUTPUT_DEVVERBOSE, *S_1);
2348
2349 outputf (OUTPUT_DEVVERBOSE, "S_2 = {");
2350 for (i = 0UL; i + 1UL < params->s_2; i++)
2351 outputf (OUTPUT_DEVVERBOSE, "%ld, ", (*S_2)->elem[i]);
2352 if (i < params->s_2)
2353 outputf (OUTPUT_DEVVERBOSE, "%ld", (*S_2)->elem[i]);
2354 outputf (OUTPUT_DEVVERBOSE, "}\n");
2355 }
2356
2357 return 0;
2358 }
2359
2360 #if 0
2361 static mpzspv_t *
2362 mpzspv_init_mt (spv_size_t len, mpzspm_t mpzspm)
2363 {
2364 int i; /* OpenMP wants the iteration variable a signed type */
2365 mpzspv_t *x = (mpzspv_t *) malloc (mpzspm->sp_num * sizeof (spv_t *));
2366
2367 if (x == NULL)
2368 return NULL;
2369
2370 for (i = 0; i < (int) mpzspm->sp_num; i++)
2371 x[i] = NULL;
2372
2373 #ifdef _OPENMP
2374 #pragma omp parallel private(i) shared(x)
2375 {
2376 #pragma omp for
2377 #endif
2378 for (i = 0; i < (int) mpzspm->sp_num; i++)
2379 x[i] = (spv_t *) sp_aligned_malloc (len * sizeof (sp_t));
2380
2381 #ifdef _OPENMP
2382 }
2383 #endif
2384
2385 for (i = 0; i < (int) mpzspm->sp_num; i++)
2386 if (x[i] == NULL)
2387 break;
2388
2389 if (i != (int) mpzspm->sp_num) /* There is a NULL pointer */
2390 {
2391 for (i = 0; i < (int) mpzspm->sp_num; i++)
2392 if (x[i] != NULL)
2393 sp_aligned_free(x[i]);
2394 return NULL;
2395 }
2396
2397 #if 0
2398 if (test_verbose (OUTPUT_DEVVERBOSE))
2399 {
2400 spv_t * last = x[0];
2401 printf ("mpzspv_init_mt: x[0] = %p\n", x[0]);
2402 for (i = 1; i < (int) mpzspm->sp_num; i++)
2403 printf ("mpzspv_init_mt: x[%d] = %p, distance = %ld\n",
2404 i, x[i], (long) (x[i] - x[i-1]));
2405 }
2406 #endif
2407
2408 return x;
2409 }
2410 #endif
2411
2412 /* Square the reciprocal Laurent polynomial S(x) of degree 2*n-2.
2413 S(x) = s_0 + \sum_{i=1}^{n-1} s_i (x^i + x^{-1}).
2414 S[i] contains the n coefficients s_i, 0 <= i <= n-1.
2415 R[i] will contain the 2n-1 coefficients r_i, 0 <= i <= 2*n-2, where
2416 R(x) = S(x)^2 = r_0 + \sum_{i=1}^{2n-2} r_i (x^i + x^{-1}).
2417 dft must have power of 2 length len >= 2n.
2418 The NTT primes must be == 1 (mod 3*len).
2419 */
2420
2421 #undef TRACE_ntt_sqr_reciprocal
2422 static void
ntt_sqr_reciprocal(mpzv_t R,const mpzv_t S,mpzspv_t dft,const spv_size_t n,const mpzspm_t ntt_context)2423 ntt_sqr_reciprocal (mpzv_t R, const mpzv_t S, mpzspv_t dft,
2424 const spv_size_t n, const mpzspm_t ntt_context)
2425 {
2426 #ifdef WANT_ASSERT
2427 mpz_t S_eval_1, R_eval_1;
2428 #endif
2429
2430 if (n == 0)
2431 return;
2432
2433 if (n == 1)
2434 {
2435 mpz_mul (R[0], S[0], S[0]);
2436 mpz_mod (R[0], R[0], ntt_context->modulus);
2437 return;
2438 }
2439
2440 #ifdef WANT_ASSERT
2441 mpz_init (S_eval_1);
2442 list_recip_eval1 (S_eval_1, S, n);
2443 /* Compute (S(1))^2 */
2444 mpz_mul (S_eval_1, S_eval_1, S_eval_1);
2445 mpz_mod (S_eval_1, S_eval_1, ntt_context->modulus);
2446 #endif
2447
2448 #ifdef TRACE_ntt_sqr_reciprocal
2449 printf ("ntt_sqr_reciprocal: n %lu, length %lu\n", n, len);
2450 gmp_printf ("Input polynomial is %Zd", S[0]);
2451 {
2452 int j;
2453 for (j = 1; (spv_size_t) j < n; j++)
2454 gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", S[j], j, j);
2455 }
2456 printf ("\n");
2457 #endif
2458
2459 /* Fill NTT elements [0 .. n-1] with coefficients */
2460 mpzspv_from_mpzv (dft, (spv_size_t) 0, S, n, ntt_context);
2461 mpzspv_sqr_reciprocal (dft, n, ntt_context);
2462
2463 #if defined(_OPENMP)
2464 #pragma omp parallel if (n > 50)
2465 #endif
2466 {
2467 spv_size_t i, offset = 0, chunklen = 2*n - 1;
2468
2469 #if defined(_OPENMP)
2470 {
2471 const int nr_chunks = omp_get_num_threads();
2472 const int thread_nr = omp_get_thread_num();
2473
2474 chunklen = (chunklen - 1) / (spv_size_t) nr_chunks + 1;
2475 offset = (spv_size_t) thread_nr * chunklen;
2476 if (offset <= 2*n - 1)
2477 chunklen = MIN(chunklen, (2*n - 1) - offset);
2478 else
2479 chunklen = 0UL;
2480 }
2481 #endif
2482
2483 mpzspv_to_mpzv (dft, offset, R + offset, chunklen, ntt_context);
2484 for (i = offset; i < offset + chunklen; i++)
2485 mpz_mod (R[i], R[i], ntt_context->modulus);
2486 }
2487
2488 #ifdef TRACE_ntt_sqr_reciprocal
2489 gmp_printf ("ntt_sqr_reciprocal: Output polynomial is %Zd", R[0]);
2490 for (j = 1; (spv_size_t) j < 2*n - 1; j++)
2491 gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
2492 printf ("\n");
2493 #endif
2494
2495 #ifdef WANT_ASSERT
2496 mpz_init (R_eval_1);
2497 /* Compute (S^2)(1) and compare to (S(1))^2 */
2498 list_recip_eval1 (R_eval_1, R, 2 * n - 1);
2499 mpz_mod (R_eval_1, R_eval_1, ntt_context->modulus);
2500 if (mpz_cmp (R_eval_1, S_eval_1) != 0)
2501 {
2502 gmp_fprintf (stderr, "ntt_sqr_reciprocal: (S(1))^2 = %Zd but "
2503 "(S^2)(1) = %Zd\n", S_eval_1, R_eval_1);
2504 #if 0
2505 gmp_printf ("Output polynomial is %Zd", R[0]);
2506 for (j = 1; (spv_size_t) j < 2*n - 1; j++)
2507 gmp_printf (" + %Zd * (x^%lu + x^(-%lu))", R[j], j, j);
2508 printf ("\n");
2509 #endif
2510 abort ();
2511 }
2512 mpz_clear (S_eval_1);
2513 mpz_clear (R_eval_1);
2514 #endif
2515 }
2516
2517
2518 /* Computes gcd(\prod_{0 <= i < len} (ntt[i + offset] + add[i]), N),
2519 the NTT residues are converted to integer residues (mod N) first.
2520 If add == NULL, add[i] is assumed to be 0. */
2521
2522 static void
ntt_gcd(mpz_t f,mpz_t * product,mpzspv_t ntt,const unsigned long ntt_offset,const listz_t add,const unsigned long len_param,const mpzspm_t ntt_context,mpmod_t modulus_param)2523 ntt_gcd (mpz_t f, mpz_t *product, mpzspv_t ntt, const unsigned long ntt_offset,
2524 const listz_t add, const unsigned long len_param,
2525 const mpzspm_t ntt_context, mpmod_t modulus_param)
2526 {
2527 unsigned long i, j;
2528 const unsigned long Rlen = MPZSPV_NORMALISE_STRIDE;
2529 listz_t R;
2530 unsigned long len = len_param, thread_offset = 0;
2531 mpres_t tmpres, tmpprod, totalprod;
2532 mpmod_t modulus;
2533 long timestart, realstart;
2534
2535 outputf (OUTPUT_VERBOSE, "Computing gcd of coefficients and N");
2536 timestart = cputime ();
2537 realstart = realtime ();
2538
2539 /* All the threads will multiply their partial products to this one. */
2540 mpres_init (totalprod, modulus_param);
2541 mpres_set_ui (totalprod, 1UL, modulus_param);
2542
2543 #ifdef _OPENMP
2544 #pragma omp parallel if (len > 100) private(i, j, R, len, thread_offset, tmpres, tmpprod, modulus) shared(totalprod)
2545 {
2546 const int nr_chunks = omp_get_num_threads();
2547 const int thread_nr = omp_get_thread_num();
2548
2549 len = (len_param - 1) / nr_chunks + 1;
2550 thread_offset = thread_nr * len;
2551 if (thread_offset <= len_param)
2552 len = MIN(len, len_param - thread_offset);
2553 else
2554 len = 0;
2555 #pragma omp master
2556 {
2557 outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
2558 }
2559 #endif
2560
2561 /* Make a private copy of the mpmod_t struct */
2562 mpmod_init_set (modulus, modulus_param);
2563
2564 R = init_list2 (Rlen, (mpz_size (modulus->orig_modulus) + 2) *
2565 GMP_NUMB_BITS);
2566 mpres_init (tmpres, modulus);
2567 mpres_init (tmpprod, modulus);
2568 mpres_set_ui (tmpprod, 1UL, modulus);
2569
2570 for (i = 0; i < len; i += Rlen)
2571 {
2572 const unsigned long blocklen = MIN(len - i, Rlen);
2573
2574 /* Convert blocklen residues from NTT to integer representatives
2575 and store them in R */
2576 mpzspv_to_mpzv (ntt, ntt_offset + thread_offset + i, R, blocklen,
2577 ntt_context);
2578
2579 /* Accumulate product in tmpprod */
2580 for (j = 0; j < blocklen; j++)
2581 {
2582 outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[j]);
2583 if (add != NULL)
2584 mpz_add (R[j], R[j], add[i + thread_offset + j]);
2585 mpres_set_z_for_gcd (tmpres, R[j], modulus);
2586 #define TEST_ZERO_RESULT
2587 #ifdef TEST_ZERO_RESULT
2588 if (mpres_is_zero (tmpres, modulus))
2589 outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
2590 #endif
2591 mpres_mul (tmpprod, tmpprod, tmpres, modulus);
2592 }
2593 }
2594 #ifdef _OPENMP
2595 #pragma omp critical
2596 {
2597 mpres_mul (totalprod, totalprod, tmpprod, modulus);
2598 }
2599 #else
2600 mpres_set (totalprod, tmpprod, modulus);
2601 #endif
2602 mpres_clear (tmpres, modulus);
2603 mpres_clear (tmpprod, modulus);
2604 mpmod_clear (modulus);
2605 clear_list (R, Rlen);
2606 #ifdef _OPENMP
2607 }
2608 #endif
2609
2610 {
2611 mpz_t n;
2612 mpz_init (n);
2613 mpz_set_ui (n, len_param);
2614 mpres_set_z_for_gcd_fix (totalprod, totalprod, n, modulus_param);
2615 mpz_clear (n);
2616 }
2617
2618 if (product != NULL)
2619 mpres_get_z (*product, totalprod, modulus_param);
2620
2621 mpres_gcd (f, totalprod, modulus_param);
2622 mpres_clear (totalprod, modulus_param);
2623
2624 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
2625 }
2626
2627
2628 int
pm1fs2(mpz_t f,const mpres_t X,mpmod_t modulus,const faststage2_param_t * params)2629 pm1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus,
2630 const faststage2_param_t *params)
2631 {
2632 unsigned long phiP, nr;
2633 unsigned long i, l, lenF, lenG, lenR, tmplen;
2634 sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
2635 progressions of prime length */
2636 set_long_t *S_2; /* This is stored as a regular set */
2637 listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
2638 degree s_1. It is symmetric, so has only s_1 / 2 + 1
2639 distinct coefficients. The sequence h_j will be stored in
2640 the same memory and won't be a monic polynomial, so the
2641 leading 1 monomial of F will be stored explicitly. Hence we
2642 need s_1 / 2 + 1 entries. */
2643 listz_t g, h, tmp, R;
2644 mpz_t mt; /* All-purpose temp mpz_t */
2645 mpres_t mr; /* All-purpose temp mpres_t */
2646 int youpi = ECM_NO_FACTOR_FOUND;
2647 long timetotalstart, realtotalstart, timestart;
2648
2649 timetotalstart = cputime ();
2650 realtotalstart = realtime ();
2651
2652 phiP = eulerphi (params->P);
2653 ASSERT_ALWAYS (phiP == params->s_1 * params->s_2);
2654 ASSERT_ALWAYS (params->s_1 < params->l);
2655 nr = params->l - params->s_1; /* Number of points we evaluate */
2656
2657 if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
2658 return ECM_ERROR;
2659
2660 /* Allocate all the memory we'll need */
2661 /* Allocate the correct amount of space for each mpz_t or the
2662 reallocations will up to double the time for stage 2! */
2663 mpz_init (mt);
2664 mpres_init (mr, modulus);
2665 lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
2666 the leading 1 monomial for each factor */
2667 F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
2668 h = malloc ((params->s_1 + 1) * sizeof (mpz_t));
2669 if (h == NULL)
2670 {
2671 fprintf (stderr, "Cannot allocate memory in pm1fs2\n");
2672 exit (1);
2673 }
2674 lenG = params->l;
2675 g = init_list2 (lenG, (unsigned int) abs (modulus->bits));
2676 lenR = nr;
2677 R = init_list2 (lenR, (unsigned int) abs (modulus->bits));
2678 tmplen = 3UL * params->l + list_mul_mem (params->l / 2);
2679 outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
2680 if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
2681 {
2682 tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
2683 /* FIXME: It appears TMulGen_space() returns a too small value! */
2684 outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n",
2685 tmplen);
2686 }
2687 #ifdef SHOW_TMP_USAGE
2688 tmp = init_list (tmplen);
2689 #else
2690 tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
2691 #endif
2692
2693 mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
2694 outputf (OUTPUT_TRACE,
2695 "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
2696 modulus->orig_modulus, mt);
2697
2698
2699 /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2k_1}) */
2700 outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
2701
2702 timestart = cputime ();
2703
2704 /* First compute X + 1/X */
2705 mpres_invert (mr, X, modulus);
2706 mpres_add (mr, mr, X, modulus);
2707
2708 i = poly_from_sets_V (F, mr, S_1, tmp, tmplen, modulus, NULL, NULL);
2709 ASSERT_ALWAYS(2 * i == params->s_1);
2710 ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
2711 free (S_1);
2712 S_1 = NULL;
2713
2714 outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
2715 if (test_verbose (OUTPUT_TRACE))
2716 {
2717 for (i = 0; i < params->s_1 / 2 + 1; i++)
2718 outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
2719 outputf (OUTPUT_TRACE, "f(x) = f_0");
2720 for (i = 1; i < params->s_1 / 2 + 1; i++)
2721 outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
2722 outputf (OUTPUT_TRACE, "/* PARI */ \n");
2723 }
2724
2725 mpz_set_ui (mt, params->P);
2726 mpres_pow (mr, X, mt, modulus); /* mr = X^P */
2727 pm1_sequence_h (F, NULL, F, mr, params->s_1 / 2 + 1, modulus, NULL);
2728
2729 /* Make a symmetric copy of F in h. It will have length
2730 s_1 + 1 = 2*lenF - 1 */
2731 /* I.e. with F = [3, 2, 1], s_1 = 4, we want h = [1, 2, 3, 2, 1] */
2732 for (i = 0; i < params->s_1 / 2 + 1; i++)
2733 *(h[i]) = *(F[params->s_1 / 2 - i]); /* Clone the mpz_t. */
2734 for (i = 0; i < params->s_1 / 2; i++)
2735 *(h[i + params->s_1 / 2 + 1]) = *(F[i + 1]);
2736 if (test_verbose (OUTPUT_TRACE))
2737 {
2738 for (i = 0; i < params->s_1 + 1; i++)
2739 outputf (OUTPUT_VERBOSE, "h_%lu = %Zd; /* PARI */\n", i, h[i]);
2740 outputf (OUTPUT_VERBOSE, "h(x) = h_0");
2741 for (i = 1; i < params->s_1 + 1; i++)
2742 outputf (OUTPUT_VERBOSE, " + h_%lu * x^%lu", i, i);
2743 outputf (OUTPUT_VERBOSE, " /* PARI */\n");
2744 }
2745
2746 for (l = 0; l < params->s_2; l++)
2747 {
2748 const unsigned long M = params->l - 1L - params->s_1 / 2L;
2749 outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
2750 l + 1, params->s_2);
2751 pm1_sequence_g (g, NULL, X, params->P, M, params->l,
2752 params->m_1, S_2->elem[l], modulus, NULL);
2753
2754 /* Do the convolution */
2755 /* Use the transposed "Middle Product" algorithm */
2756 /* TMulGen reverses the first input sequence, but that doesn't matter
2757 since h is symmetric. */
2758
2759 outputf (OUTPUT_VERBOSE, "TMulGen of g and h");
2760 timestart = cputime ();
2761 ASSERT(tmplen >= TMulGen_space (nr - 1, params->l - 1, params->s_1));
2762
2763 /* Computes rev(h)*g, stores coefficients of x^(s_1) to
2764 x^(s_1+nr-1) = x^(len-1) */
2765 if (TMulGen (R, nr - 1, h, params->s_1, g, params->l - 1, tmp,
2766 modulus->orig_modulus) < 0)
2767 {
2768 outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
2769 "of memory)\n");
2770 youpi = ECM_ERROR;
2771 break;
2772 }
2773 list_mod (R, R, nr, modulus->orig_modulus);
2774
2775 outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
2776
2777 #if 0 && defined(WANT_ASSERT)
2778
2779 /* See if R[i] is correct, with a test that works even if i0 != 0 */
2780 /* More expensive self-test */
2781 /* alpha = beta*(i0 + l*nr) */
2782 /* This code is old and probably does not work. */
2783
2784 outputf (OUTPUT_VERBOSE, "Verifying all results (slow)");
2785 for (i = 0; i < nr; i++)
2786 {
2787 mpz_set_ui (mt, nr * l);
2788 mpz_add (mt, mt, root_params->i0);
2789 mpz_add_ui (mt, mt, i);
2790 mpz_mul_ui (mt, mt, beta);
2791 mpres_get_z (tmp[0], X, modulus);
2792 mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus);
2793 /* Hence, tmp[0] = X^(alpha + i * beta) */
2794 list_eval_poly (tmp[1], F, tmp[0], dF, 1, modulus->orig_modulus,
2795 tmp + 2);
2796
2797 mpz_set_ui (mt, i);
2798 mpz_mul_ui (mt, mt, i);
2799 mpz_mul_ui (mt, mt, beta / 2); /* h(i) = beta*i^2/2 */
2800 mpres_get_z (tmp[0], X, modulus);
2801 mpz_powm (tmp[0], tmp[0], mt, modulus->orig_modulus); /* X^h(1) */
2802 mpz_mul (tmp[0], tmp[0], R[i]);
2803 mpz_mod (tmp[0], tmp[0], modulus->orig_modulus);
2804 if (mpz_cmp (tmp[0], tmp[1]) != 0)
2805 {
2806 outputf (OUTPUT_ERROR, "Result in R[%ld] incorrect.\n", i);
2807 outputf (OUTPUT_ERROR, "R[%ld] = %Zd\n", i, R[i]);
2808 abort ();
2809 }
2810 }
2811 outputf (OUTPUT_VERBOSE, " - everything's correct! :-D\n");
2812 #endif
2813
2814 if (test_verbose (OUTPUT_TRACE))
2815 {
2816 for (i = 0; i < nr; i++)
2817 outputf (OUTPUT_TRACE, "r_%lu = %Zd; /* PARI */\n", i, R[i]);
2818 }
2819
2820 outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)");
2821 timestart = cputime ();
2822
2823 {
2824 mpres_t tmpres, tmpprod;
2825 mpres_init (tmpres, modulus);
2826 mpres_init (tmpprod, modulus);
2827 mpres_set_z_for_gcd (tmpprod, R[0], modulus);
2828 for (i = 1; i < nr; i++)
2829 {
2830 mpres_set_z_for_gcd (tmpres, R[i], modulus);
2831 mpres_mul (tmpprod, tmpprod, tmpres, modulus);
2832 }
2833 mpres_get_z (tmp[1], tmpprod, modulus); /* For printing */
2834 mpres_gcd (tmp[0], tmpprod, modulus);
2835 mpres_clear (tmpprod, modulus);
2836 mpres_clear (tmpres, modulus);
2837 }
2838
2839 outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
2840 outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", tmp[1]);
2841
2842 if (mpz_cmp_ui (tmp[0], 1UL) > 0)
2843 {
2844 mpz_set (f, tmp[0]);
2845 youpi = ECM_FACTOR_FOUND_STEP2;
2846 break;
2847 }
2848 }
2849
2850 #ifdef SHOW_TMP_USAGE
2851 for (i = tmplen - 1; i > 0; i--)
2852 if (tmp[i]->_mp_alloc > 1)
2853 break;
2854 outputf (OUTPUT_DEVVERBOSE, "Highest used temp element is tmp[%lu]\n", i);
2855 #endif
2856
2857 free (S_2);
2858 free (h);
2859 clear_list (F, lenF);
2860 clear_list (g, lenG);
2861 clear_list (R, lenR);
2862 clear_list (tmp, tmplen);
2863
2864 mpz_clear (mt);
2865 mpres_clear (mr, modulus);
2866
2867 outputf (OUTPUT_NORMAL, "Step 2");
2868 /* In normal output mode, print only cpu time as we always have.
2869 In verbose mode, print real time as well if we used multi-threading */
2870 if (test_verbose (OUTPUT_VERBOSE))
2871 print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
2872 else
2873 print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
2874
2875 return youpi;
2876 }
2877
2878
2879 int
pm1fs2_ntt(mpz_t f,const mpres_t X,mpmod_t modulus,const faststage2_param_t * params)2880 pm1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus,
2881 const faststage2_param_t *params)
2882 {
2883 unsigned long nr;
2884 unsigned long l, lenF;
2885 sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
2886 progressions of prime length) */
2887 set_long_t *S_2; /* This is stored as a regular set */
2888 listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
2889 degree s_1. It is symmetric, so has only s_1 / 2 + 1
2890 distinct coefficients. The sequence h_j will be stored in
2891 the same memory and won't be a monic polynomial, so the
2892 leading 1 monomial of F will be stored explicitly. Hence we
2893 need s_1 / 2 + 1 entries. */
2894 mpzspm_t ntt_context;
2895 mpzspv_t g_ntt, h_ntt;
2896 mpz_t mt; /* All-purpose temp mpz_t */
2897 mpz_t product; /* Product of each multi-point evaluation */
2898 mpz_t *product_ptr = NULL;
2899 mpres_t tmpres; /* All-purpose temp mpres_t */
2900 int youpi = ECM_NO_FACTOR_FOUND;
2901 long timetotalstart, realtotalstart, timestart, realstart;
2902
2903 timetotalstart = cputime ();
2904 realtotalstart = realtime ();
2905
2906 ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
2907 ASSERT_ALWAYS (params->s_1 < params->l);
2908 nr = params->l - params->s_1; /* Number of points we evaluate */
2909
2910 /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution
2911 with g. We need NTT of transform length l. We do it here at the start
2912 of stage 2 so that in case of a "not enough primes" condition,
2913 we don't have to wait until after F is built to get the error. */
2914
2915 ntt_context = mpzspm_init (params->l, modulus->orig_modulus);
2916 if (ntt_context == NULL)
2917 {
2918 outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
2919 "presumably out of memory\n");
2920 return ECM_ERROR;
2921 }
2922
2923 print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ",
2924 ntt_context);
2925
2926 if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
2927 return ECM_ERROR;
2928
2929 /* Allocate all the memory we'll need for building f */
2930 mpz_init (mt);
2931 mpres_init (tmpres, modulus);
2932 lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
2933 the leading 1 monomial for each factor */
2934 {
2935 /* in some cases the above value of lenF is not enough, for example with
2936 s_1 = 10, which gives lenF = 7, but 9 entries are needed */
2937 unsigned long mem = mem_poly_from_sets_V (S_1);
2938 if (mem > lenF)
2939 lenF = mem;
2940 }
2941
2942 F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
2943
2944 mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
2945 outputf (OUTPUT_TRACE,
2946 "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
2947 modulus->orig_modulus, mt);
2948
2949 #if 0 && defined (WANT_ASSERT)
2950 /* For this self test run with a large enough B2 so that enough memory
2951 is allocated for tmp and F_ntt, otherwise it segfaults. */
2952 {
2953 int testlen = 255;
2954 int i, j;
2955 /* A test of ntt_sqr_reciprocal() */
2956 for (j = 1; j <= testlen; j++)
2957 {
2958 outputf (OUTPUT_VERBOSE,
2959 "Testing ntt_sqr_reciprocal() for input degree %d\n",
2960 j - 1);
2961 for (i = 0; i < j; i++)
2962 mpz_set_ui (tmp[i], 1UL);
2963 ntt_sqr_reciprocal (tmp, tmp, F_ntt, (spv_size_t) j, ntt_context_F);
2964 for (i = 0; i < 2 * j - 1; i++)
2965 {
2966 ASSERT (mpz_cmp_ui (tmp[i], 2 * j - 1 - i) == 0);
2967 }
2968 }
2969 outputf (OUTPUT_VERBOSE,
2970 "Test of ntt_sqr_reciprocal() for input degree 2 ... %d passed\n",
2971 testlen - 1);
2972 }
2973 #endif
2974
2975
2976 /* First compute X + 1/X */
2977 mpres_invert (tmpres, X, modulus);
2978 mpres_add (tmpres, tmpres, X, modulus);
2979
2980 if (build_F_ntt (F, tmpres, S_1, params, modulus) == ECM_ERROR)
2981 {
2982 free (S_1);
2983 free (S_2);
2984 mpz_clear (mt);
2985 mpres_clear (tmpres, modulus);
2986 mpzspm_clear (ntt_context);
2987 clear_list (F, lenF);
2988 return ECM_ERROR;
2989 }
2990
2991 free (S_1);
2992 S_1 = NULL;
2993
2994 h_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
2995
2996 mpz_set_ui (mt, params->P);
2997 mpres_pow (tmpres, X, mt, modulus); /* tmpres = X^P */
2998 pm1_sequence_h (NULL, h_ntt, F, tmpres, params->s_1 / 2 + 1, modulus,
2999 ntt_context);
3000
3001 clear_list (F, lenF);
3002 g_ntt = mpzspv_init (params->l, ntt_context);
3003
3004 /* Compute the DCT-I of h */
3005 outputf (OUTPUT_VERBOSE, "Computing DCT-I of h");
3006 #ifdef _OPENMP
3007 outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
3008 #endif
3009 timestart = cputime ();
3010 realstart = realtime ();
3011
3012 mpzspv_to_dct1 (h_ntt, h_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
3013 g_ntt, ntt_context);
3014 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3015
3016 if (test_verbose (OUTPUT_RESVERBOSE))
3017 {
3018 mpz_init (product);
3019 product_ptr = &product;
3020 }
3021
3022 for (l = 0; l < params->s_2; l++)
3023 {
3024 const unsigned long M = params->l - 1L - params->s_1 / 2L;
3025
3026 outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
3027 l + 1, params->s_2);
3028 /* Compute the coefficients of the polynomial g(x) */
3029 pm1_sequence_g (NULL, g_ntt, X, params->P, M, params->l,
3030 params->m_1, S_2->elem[l], modulus, ntt_context);
3031
3032 /* Do the convolution */
3033 outputf (OUTPUT_VERBOSE, "Computing g*h");
3034 #ifdef _OPENMP
3035 outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
3036 #endif
3037 timestart = cputime ();
3038 realstart = realtime ();
3039 mpzspv_mul_by_dct (g_ntt, h_ntt, params->l, ntt_context,
3040 NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
3041 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3042
3043 /* Compute GCD of N and coefficients of product polynomial */
3044 ntt_gcd (mt, product_ptr, g_ntt, params->s_1 / 2, NULL, nr, ntt_context,
3045 modulus);
3046
3047 outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);
3048
3049 /* If we found a factor, stop */
3050 if (mpz_cmp_ui (mt, 1UL) > 0)
3051 {
3052 mpz_set (f, mt);
3053 youpi = ECM_FACTOR_FOUND_STEP2;
3054 break;
3055 }
3056 }
3057
3058 if (test_verbose (OUTPUT_RESVERBOSE))
3059 {
3060 product_ptr = NULL;
3061 mpz_clear (product);
3062 }
3063 mpzspv_clear (g_ntt, ntt_context);
3064 mpzspv_clear (h_ntt, ntt_context);
3065 mpzspm_clear (ntt_context);
3066 mpres_clear (tmpres, modulus);
3067 mpz_clear (mt);
3068 free (S_2);
3069
3070 outputf (OUTPUT_NORMAL, "Step 2");
3071 /* In normal output mode, print only cpu time as we always have.
3072 In verbose mode, print real time as well if we used multi-threading */
3073 if (test_verbose (OUTPUT_VERBOSE))
3074 print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
3075 else
3076 print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
3077
3078 return youpi;
3079 }
3080
3081
3082 static void
gfp_ext_print(const mpres_t r_x,const mpres_t r_y,mpmod_t modulus,const int verbose)3083 gfp_ext_print (const mpres_t r_x, const mpres_t r_y, mpmod_t modulus,
3084 const int verbose)
3085 {
3086 mpz_t t1, t2;
3087
3088 if (!test_verbose (verbose))
3089 return;
3090
3091 mpz_init (t1);
3092 mpz_init (t2);
3093 mpres_get_z (t1, r_x, modulus);
3094 mpres_get_z (t2, r_y, modulus);
3095 outputf (verbose, "Mod(%Zd, N) + Mod(%Zd, N) * w", t1, t2);
3096
3097 mpz_clear (t1);
3098 mpz_clear (t2);
3099 }
3100
3101
3102
3103 /* Multiplies (a_0 + a_1*sqrt(Delta)) * (b_0 + b_1*sqrt(Delta))
3104 using four multiplications. Result goes in (r_0 + r_1*sqrt(Delta)).
3105 a_0, b_0, r_0 as well as a_1, b_1, r_1 may overlap arbitrarily. t[0], t[1],
3106 t[2] and Delta must not overlap with anything. */
3107 /* FIXME: is there a faster multiplication routine if both inputs have
3108 norm 1? */
3109
3110 static void
gfp_ext_mul(mpres_t r_0,mpres_t r_1,const mpres_t a_0,const mpres_t a_1,const mpres_t b_0,const mpres_t b_1,const mpres_t Delta,mpmod_t modulus,ATTRIBUTE_UNUSED const unsigned long tmplen,mpres_t * tmp)3111 gfp_ext_mul (mpres_t r_0, mpres_t r_1, const mpres_t a_0, const mpres_t a_1,
3112 const mpres_t b_0, const mpres_t b_1, const mpres_t Delta,
3113 mpmod_t modulus, ATTRIBUTE_UNUSED const unsigned long tmplen,
3114 mpres_t *tmp)
3115 {
3116 ASSERT (tmplen >= 2);
3117 if (0 && test_verbose (OUTPUT_TRACE))
3118 {
3119 mpz_t t;
3120 mpz_init (t);
3121 mpres_get_z (t, Delta, modulus);
3122 outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ w = quadgen (4*%Zd); "
3123 "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
3124 mpz_clear (t);
3125 outputf (OUTPUT_TRACE, "/* gfp_ext_mul */ (");
3126 gfp_ext_print (a_0, a_1, modulus, OUTPUT_TRACE);
3127 outputf (OUTPUT_TRACE, ") * (");
3128 gfp_ext_print (b_0, b_1, modulus, OUTPUT_TRACE);
3129 }
3130
3131 mpres_add (tmp[0], a_0, a_1, modulus);
3132 mpres_add (tmp[1], b_0, b_1, modulus);
3133 mpres_mul (tmp[1], tmp[0], tmp[1], modulus); /* t[1] = (a_0+a_1)*(b_0+b_1) =
3134 a_0*b_0 + a_0*b_1 + a_1*b_0 +
3135 a_1*b_1 */
3136
3137 mpres_mul (r_0, a_0, b_0, modulus); /* r_0 = a_0*b_0. We don't need a_0
3138 or b_0 any more now */
3139 mpres_sub (tmp[1], tmp[1], r_0, modulus); /* t[1] = a_0*b_1 + a_1*b_0 +
3140 a_1*b_1 */
3141
3142 mpres_mul (tmp[0], a_1, b_1, modulus); /* t[0] = a_1*b_1. We don't need
3143 a_1 or b_1 any more now */
3144 mpres_sub (r_1, tmp[1], tmp[0], modulus); /* r_1 == a_0*b_1 + a_1*b_0 */
3145
3146 mpres_mul (tmp[0], tmp[0], Delta, modulus); /* t[0] = a_1*b_1*Delta */
3147 mpres_add (r_0, r_0, tmp[0], modulus); /* r_0 = a_0*b_0 + a_1*b_1*Delta */
3148
3149 if (0 && test_verbose (OUTPUT_TRACE))
3150 {
3151 outputf (OUTPUT_TRACE, ") == ");
3152 gfp_ext_print (r_0, r_1, modulus, OUTPUT_TRACE);
3153 outputf (OUTPUT_TRACE, " /* PARI C */\n");
3154 }
3155 }
3156
3157
3158 /* Computes (a_0 + a_1 * sqrt(Delta))^2, where the norm
3159 (a_0^2 - a_1^2*Delta) is assumed to be equal to 1. Hence
3160 (a_0 + a_1 * sqrt(Delta))^2 = a_0^2 + 2*a_0*a_1*sqrt(Delta) + a_1^2*Delta
3161 and a_0^2 + a_1^2*Delta = a_0^2 + a_1^2*Delta + norm - 1 = 2*a_0^2 - 1.
3162 a_0 and r_0, as well as a_1 and r_1 may overlap */
3163
3164 static void
gfp_ext_sqr_norm1(mpres_t r_0,mpres_t r_1,const mpres_t a_0,const mpres_t a_1,mpmod_t modulus)3165 gfp_ext_sqr_norm1 (mpres_t r_0, mpres_t r_1, const mpres_t a_0,
3166 const mpres_t a_1, mpmod_t modulus)
3167 {
3168 ASSERT (a_0 != r_1); /* a_0 is read after r_1 is written */
3169
3170 if (pari)
3171 gmp_printf ("/* gfp_ext_sqr_norm1 */ (%Zd + %Zd * w)^2 %% N == ", a_0, a_1);
3172
3173 mpres_mul (r_1, a_0, a_1, modulus);
3174 mpres_add (r_1, r_1, r_1, modulus); /* r_1 = 2*a_0*a_1 */
3175
3176 mpres_sqr (r_0, a_0, modulus);
3177 mpres_add (r_0, r_0, r_0, modulus);
3178 mpres_sub_ui (r_0, r_0, 1UL, modulus); /* r_0 = 2*a_0^2 - 1 */
3179
3180 if (pari)
3181 gmp_printf ("(%Zd + %Zd * w) %% N /* PARI C */\n", r_0, r_1);
3182 }
3183
3184
3185 /* Raise (a0 + a1*sqrt(Delta)) to the power e which is a signed long int.
3186 (a0 + a1*sqrt(Delta)) is assumed to have norm 1, i.e.
3187 a0^2 - a1^2*Delta == 1. The result is (r0 * r1*sqrt(Delta)).
3188 a0, a1, r0 and r1 must not overlap */
3189
3190 static void
gfp_ext_pow_norm1_sl(mpres_t r0,mpres_t r1,const mpres_t a0,const mpres_t a1,const long e,const mpres_t Delta,mpmod_t modulus,unsigned long tmplen,mpres_t * tmp)3191 gfp_ext_pow_norm1_sl (mpres_t r0, mpres_t r1, const mpres_t a0,
3192 const mpres_t a1, const long e, const mpres_t Delta,
3193 mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
3194 {
3195 const unsigned long abs_e = labs (e);
3196 unsigned long mask = ~0UL - (~0UL >> 1);
3197
3198 ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);
3199
3200 if (e == 0)
3201 {
3202 mpres_set_ui (r0, 1UL, modulus);
3203 mpres_set_ui (r1, 0UL, modulus);
3204 return;
3205 }
3206
3207 /* If e < 0, we want 1/(a0 + a1*sqrt(Delta)). By extending with
3208 a0 - a1*sqrt(Delta), we get
3209 (a0 - a1*sqrt(Delta)) / (a0^2 - a1^2 * Delta), but that denomiator
3210 is the norm which is known to be 1, so the result is
3211 a0 - a1*sqrt(Delta). */
3212
3213 while ((abs_e & mask) == 0UL)
3214 mask >>= 1;
3215
3216 mpres_set (r0, a0, modulus);
3217 mpres_set (r1, a1, modulus);
3218
3219 while (mask > 1UL)
3220 {
3221 gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
3222 mask >>= 1;
3223 if (abs_e & mask)
3224 gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
3225 }
3226
3227 if (e < 0)
3228 mpres_neg (r1, r1, modulus);
3229
3230 if (0 && test_verbose (OUTPUT_TRACE))
3231 {
3232 mpz_t t;
3233 mpz_init (t);
3234 mpres_get_z (t, Delta, modulus);
3235 outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ w = quadgen (4*%Zd); "
3236 "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
3237 mpz_clear (t);
3238 outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1_sl */ (");
3239 gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
3240 outputf (OUTPUT_TRACE, ")^(%ld) == ", e);
3241 gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
3242 outputf (OUTPUT_TRACE, " /* PARI C */\n");
3243 }
3244 }
3245
3246
3247 /* Same, but taking an mpz_t argument for the exponent */
3248
3249 static void
gfp_ext_pow_norm1(mpres_t r0,mpres_t r1,const mpres_t a0,const mpres_t a1,mpz_t e,const mpres_t Delta,mpmod_t modulus,unsigned long tmplen,mpres_t * tmp)3250 gfp_ext_pow_norm1 (mpres_t r0, mpres_t r1, const mpres_t a0,
3251 const mpres_t a1, mpz_t e, const mpres_t Delta,
3252 mpmod_t modulus, unsigned long tmplen, mpres_t *tmp)
3253 {
3254 mpz_t abs_e;
3255 unsigned long idx;
3256
3257 ASSERT (a0 != r0 && a1 != r0 && a0 != r1 && a1 != r1);
3258
3259 if (mpz_sgn (e) == 0)
3260 {
3261 mpres_set_ui (r0, 1UL, modulus);
3262 mpres_set_ui (r1, 0UL, modulus);
3263 return;
3264 }
3265
3266 mpz_init (abs_e);
3267 mpz_abs (abs_e, e);
3268 idx = mpz_sizeinbase (abs_e, 2) - 1; /* Thus mpz_tstbit (abs_e, idx) == 1 */
3269 ASSERT (mpz_tstbit (abs_e, idx) == 1);
3270
3271 mpres_set (r0, a0, modulus);
3272 mpres_set (r1, a1, modulus);
3273
3274 while (idx > 0UL)
3275 {
3276 gfp_ext_sqr_norm1 (r0, r1, r0, r1, modulus);
3277 idx--;
3278 if (mpz_tstbit (abs_e, idx))
3279 gfp_ext_mul (r0, r1, r0, r1, a0, a1, Delta, modulus, tmplen, tmp);
3280 }
3281
3282 if (mpz_sgn (e) < 0)
3283 mpres_neg (r1, r1, modulus);
3284
3285 mpz_clear (abs_e);
3286
3287 if (test_verbose (OUTPUT_TRACE))
3288 {
3289 mpz_t t;
3290 mpz_init (t);
3291 mpres_get_z (t, Delta, modulus);
3292 outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ w = quadgen (4*%Zd); "
3293 "N = %Zd; /* PARI */\n", t, modulus->orig_modulus);
3294 mpz_clear (t);
3295 outputf (OUTPUT_TRACE, "/* gfp_ext_pow_norm1 */ (");
3296 gfp_ext_print (a0, a1, modulus, OUTPUT_TRACE);
3297 outputf (OUTPUT_TRACE, ")^(%Zd) == ", e);
3298 gfp_ext_print (r0, r1, modulus, OUTPUT_TRACE);
3299 outputf (OUTPUT_TRACE, " /* PARI C */\n");
3300 }
3301 }
3302
3303 #if 0
3304 /* Compute r[i] = a^((k+i)^2) for i = 0, 1, ..., l-1, where "a" is an
3305 element of norm 1 in the quadratic extension ring */
3306
3307 static void
3308 gfp_ext_rn2 (mpres_t *r_x, mpres_t *r_y, const mpres_t a_x, const mpres_t a_y,
3309 const long k, const unsigned long l, const mpres_t Delta,
3310 mpmod_t modulus, const unsigned long origtmplen, mpres_t *origtmp)
3311 {
3312 mpres_t *r2_x = origtmp, *r2_y = origtmp + 2, *v = origtmp + 4,
3313 *V2 = origtmp + 6;
3314 const unsigned long newtmplen = origtmplen - 7;
3315 mpres_t *newtmp = origtmp + 7;
3316 unsigned long i;
3317
3318 if (l == 0UL)
3319 return;
3320
3321 ASSERT (origtmplen >= 8UL);
3322
3323 if (pari)
3324 gmp_printf ("/* In gfp_ext_rn2 */ ; a = %Zd + %Zd * w; /* PARI */\n",
3325 a_x, a_y, modulus->orig_modulus);
3326
3327 /* Compute r[0] = a^(k^2). We do it by two exponentiations by k and use
3328 v[0] and v[1] as temp storage */
3329 gfp_ext_pow_norm1_sl (v[0], v[1], a_x, a_y, k, Delta, modulus, newtmplen,
3330 newtmp);
3331 gfp_ext_pow_norm1_sl (r_x[0], r_y[0], v[0], v[1], k, Delta, modulus,
3332 newtmplen, newtmp);
3333 if (pari)
3334 gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
3335 "/* PARI C */\n", k, r_x[0], r_y[0]);
3336
3337 /* Compute r[1] = a^((k+1)^2) = a^(k^2 + 2k + 1)*/
3338 if (l > 1)
3339 {
3340 /* v[0] + v[1]*sqrt(Delta) still contains a^k */
3341 gfp_ext_sqr_norm1 (r_x[1], r_y[1], v[0], v[1], modulus);
3342 /* Now r[1] = a^(2k) */
3343 gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], r_x[0], r_y[0], Delta,
3344 modulus, newtmplen, newtmp);
3345 /* Now r[1] = a^(k^2 + 2k) */
3346 gfp_ext_mul (r_x[1], r_y[1], r_x[1], r_y[1], a_x, a_y, Delta, modulus,
3347 newtmplen, newtmp);
3348 /* Now r[1] = a^(k^2 + 2k + 1) = a^((k+1)^2) */
3349 }
3350 if (pari)
3351 gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2) %% N == (%Zd + %Zd * w) %% N "
3352 "/* PARI C */\n", k + 1, r_x[1], r_y[1]);
3353
3354 /* Compute r2[0] = a^(k^2+2) = a^(k^2) * a^2 */
3355 gfp_ext_sqr_norm1 (v[0], v[1], a_x, a_y, modulus);
3356 gfp_ext_mul (r2_x[0], r2_y[0], r_x[0], r_y[0], v[0], v[1], Delta, modulus,
3357 newtmplen, newtmp);
3358 if (pari)
3359 gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
3360 "/* PARI C */\n", k, r2_x[0], r2_y[0]);
3361 /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
3362 gfp_ext_mul (r2_x[1], r2_y[1], r_x[1], r_y[1], v[0], v[1], Delta, modulus,
3363 newtmplen, newtmp);
3364 if (pari)
3365 gmp_printf ("/* In gfp_ext_rn2 */ a^(%ld^2+2) %% N == (%Zd + %Zd * w) %% N "
3366 "/* PARI C */\n", k + 1, r2_x[1], r2_y[1]);
3367
3368 /* Compute V_2(a + 1/a). Since 1/a = a_x - a_y, we have a+1/a = 2*a_x.
3369 V_2(x) = x^2 - 2, so we want 4*a_x^2 - 2. */
3370 mpres_add (*V2, a_x, a_x, modulus); /* V2 = a + 1/a = 2*a_x*/
3371 V (v[0], *V2, 2 * k + 1, modulus); /* v[0] = V_{2k+1} (a + 1/a) */
3372 V (v[1], *V2, 2 * k + 3, modulus); /* v[0] = V_{2k+3} (a + 1/a) */
3373 mpres_sqr (*V2, *V2, modulus); /* V2 = 4*a_x^2 */
3374 mpres_sub_ui (*V2, *V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
3375 if (pari)
3376 {
3377 gmp_printf ("/* In gfp_ext_rn2 */ ((a + 1/a)^2 - 2) %% N == "
3378 "%Zd %% N /* PARI C */\n", *V2);
3379 gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
3380 "/* PARI C */\n", 2 * k + 1, v[0]);
3381 gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
3382 "/* PARI C */\n", 2 * k + 3, v[1]);
3383 }
3384
3385 /* Compute the remaining a^((k+i)^2) values according to Peter's
3386 recurrence */
3387
3388 for (i = 2; i < l; i++)
3389 {
3390 /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i taken
3391 modulo 2 */
3392 mpres_mul (r_x[i], r2_x[1 - i % 2], v[i % 2], modulus);
3393 mpres_sub (r_x[i], r_x[i], r2_x[i % 2], modulus);
3394 mpres_mul (r_y[i], r2_y[1 - i % 2], v[i % 2], modulus);
3395 mpres_sub (r_y[i], r_y[i], r2_y[i % 2], modulus);
3396
3397 /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
3398 mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
3399 mpres_sub (r2_x[i % 2], r2_x[i % 2], r_x[i - 2], modulus);
3400 mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
3401 mpres_sub (r2_y[i % 2], r2_y[i % 2], r_y[i - 2], modulus);
3402
3403 /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
3404 mpres_mul (newtmp[0], v[1 - i % 2], *V2, modulus);
3405 mpres_sub (v[i % 2], newtmp[0], v[i % 2], modulus);
3406 if (pari)
3407 gmp_printf ("/* In gfp_ext_rn2 */ V(%lu, a + 1/a) %% N == %Zd %% N "
3408 "/* PARI C */\n", 2 * (k + i) + 1, v[i % 2]);
3409 }
3410 }
3411 #endif
3412
3413 /* Compute g_i = x_0^{M-i} * r^{(M-i)^2} for 0 <= i < l.
3414 x_0 = b_1^{2*k_2 + (2*m_1 + 1) * P}. r = b_1^P. */
3415
3416 static void
pp1_sequence_g(listz_t g_x,listz_t g_y,mpzspv_t g_x_ntt,mpzspv_t g_y_ntt,const mpres_t b1_x,const mpres_t b1_y,const unsigned long P,const mpres_t Delta,const long M_param,const unsigned long l_param,const mpz_t m_1,const long k_2,const mpmod_t modulus_param,const mpzspm_t ntt_context)3417 pp1_sequence_g (listz_t g_x, listz_t g_y, mpzspv_t g_x_ntt, mpzspv_t g_y_ntt,
3418 const mpres_t b1_x, const mpres_t b1_y, const unsigned long P,
3419 const mpres_t Delta, const long M_param,
3420 const unsigned long l_param, const mpz_t m_1, const long k_2,
3421 const mpmod_t modulus_param, const mpzspm_t ntt_context)
3422 {
3423 const unsigned long tmplen = 3;
3424 const int want_x = (g_x != NULL || g_x_ntt != NULL);
3425 const int want_y = (g_y != NULL || g_y_ntt != NULL);
3426 mpres_t r_x, r_y, x0_x, x0_y, v2,
3427 r1_x[2], r1_y[2], r2_x[2], r2_y[2],
3428 v[2], tmp[3];
3429 mpz_t mt;
3430 mpmod_t modulus; /* Thread-local copy of modulus_param */
3431 unsigned long i, l = l_param, offset = 0;
3432 long M = M_param;
3433 long timestart, realstart;
3434 int want_output = 1;
3435
3436 outputf (OUTPUT_VERBOSE, "Computing %s%s%s",
3437 (want_x) ? "g_x" : "",
3438 (want_x && want_y) ? " and " : "",
3439 (want_y) ? "g_y" : "");
3440 timestart = cputime ();
3441 realstart = realtime ();
3442
3443 #ifdef _OPENMP
3444 #pragma omp parallel if (l > 100) private(r_x, r_y, x0_x, x0_y, v2, r1_x, r1_y, r2_x, r2_y, v, tmp, mt, modulus, i, l, offset, M, want_output)
3445 {
3446 /* When multi-threading, we adjust the parameters for each thread */
3447
3448 const int nr_chunks = omp_get_num_threads();
3449 const int thread_nr = omp_get_thread_num();
3450
3451 l = (l_param - 1) / nr_chunks + 1;
3452 offset = thread_nr * l;
3453 if (offset <= l_param)
3454 l = MIN(l, l_param - offset);
3455 else
3456 l = 0;
3457 M = M_param - (long) offset;
3458
3459 want_output = (omp_get_thread_num() == 0);
3460 if (want_output)
3461 outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
3462 #endif
3463 mpmod_init_set (modulus, modulus_param);
3464 mpres_init (r_x, modulus);
3465 mpres_init (r_y, modulus);
3466 mpres_init (x0_x, modulus);
3467 mpres_init (x0_y, modulus);
3468 mpres_init (v2, modulus);
3469 for (i = 0; i < 2UL; i++)
3470 {
3471 mpres_init (r1_x[i], modulus);
3472 mpres_init (r1_y[i], modulus);
3473 mpres_init (r2_x[i], modulus);
3474 mpres_init (r2_y[i], modulus);
3475 mpres_init (v[i], modulus);
3476 }
3477 for (i = 0; i < tmplen; i++)
3478 mpres_init (tmp[i], modulus);
3479 mpz_init (mt);
3480
3481 if (want_output && test_verbose (OUTPUT_TRACE))
3482 {
3483 mpres_get_z (mt, Delta, modulus);
3484 outputf (OUTPUT_TRACE,
3485 "\n/* pp1_sequence_g */ w = quadgen (4*%Zd); P = %lu; "
3486 "M = %ld; k_2 = %ld; m_1 = %Zd; N = %Zd; /* PARI */\n",
3487 mt, P, M, k_2, m_1, modulus->orig_modulus);
3488
3489 outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ b_1 = ");
3490 gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
3491 outputf (OUTPUT_TRACE, "; /* PARI */\n");
3492 outputf (OUTPUT_TRACE,
3493 "/* pp1_sequence_g */ r = b_1^P; /* PARI */\n");
3494 outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
3495 "x_0 = b_1^(2*k_2 + (2*m_1 + 1) * P); /* PARI */\n");
3496 outputf (OUTPUT_TRACE,
3497 "/* pp1_sequence_g */ addrec(x) = x + 1/x; /* PARI */\n");
3498 }
3499
3500 /* Compute r */
3501 gfp_ext_pow_norm1_sl (r_x, r_y, b1_x, b1_y, P, Delta, modulus,
3502 tmplen, tmp);
3503 if (want_output && test_verbose (OUTPUT_TRACE))
3504 {
3505 outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ r == ");
3506 gfp_ext_print (r_x, r_y, modulus, OUTPUT_TRACE);
3507 outputf (OUTPUT_TRACE, " /* PARI C */\n");
3508 }
3509
3510 /* Compute x0 = x_0 */
3511 mpz_mul_2exp (mt, m_1, 1UL);
3512 mpz_add_ui (mt, mt, 1UL);
3513 mpz_mul_ui (mt, mt, P);
3514 mpz_add_si (mt, mt, k_2);
3515 mpz_add_si (mt, mt, k_2); /* mt = 2*k_2 + (2*m_1 + 1) * P */
3516 gfp_ext_pow_norm1 (x0_x, x0_y, b1_x, b1_y, mt, Delta, modulus,
3517 tmplen, tmp);
3518 if (want_output && test_verbose (OUTPUT_TRACE))
3519 {
3520 outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ x_0 == ");
3521 gfp_ext_print (x0_x, x0_y, modulus, OUTPUT_TRACE);
3522 outputf (OUTPUT_TRACE, " /* PARI C */\n");
3523 }
3524
3525
3526 /* Compute g[1] = r1[0] = x0^M * r^(M^2) = (x0 * r^M)^M.
3527 We use v[0,1] as temporary storage */
3528 gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M, Delta, modulus,
3529 tmplen, tmp); /* v[0,1] = r^M */
3530 gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
3531 tmplen, tmp); /* v[0,1] = r^M * x_0 */
3532 gfp_ext_pow_norm1_sl (r1_x[0], r1_y[0], v[0], v[1], M, Delta, modulus,
3533 tmplen, tmp); /* r1[0] = (r^M * x_0)^M */
3534 if (l > 0)
3535 {
3536 if (g_x != NULL)
3537 mpres_get_z (g_x[offset], r1_x[0], modulus);
3538 if (g_y != NULL)
3539 mpres_get_z (g_y[offset], r1_y[0], modulus);
3540 if (g_x_ntt != NULL)
3541 {
3542 mpres_get_z (mt, r1_x[0], modulus);
3543 mpzspv_from_mpzv (g_x_ntt, offset, &mt, 1UL, ntt_context);
3544 }
3545 if (g_y_ntt != NULL)
3546 {
3547 mpres_get_z (mt, r1_y[0], modulus);
3548 mpzspv_from_mpzv (g_y_ntt, offset, &mt, 1UL, ntt_context);
3549 }
3550 }
3551
3552
3553 /* Compute g[1] = r1[1] = x0^(M-1) * r^((M-1)^2) = (x0 * r^(M-1))^(M-1).
3554 We use v[0,1] as temporary storage. FIXME: simplify, reusing g_0 */
3555 gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, M - 1, Delta, modulus,
3556 tmplen, tmp);
3557 gfp_ext_mul (v[0], v[1], v[0], v[1], x0_x, x0_y, Delta, modulus,
3558 tmplen, tmp);
3559 gfp_ext_pow_norm1_sl (r1_x[1], r1_y[1], v[0], v[1], M - 1, Delta,
3560 modulus, tmplen, tmp);
3561 if (l > 1)
3562 {
3563 if (g_x != NULL)
3564 mpres_get_z (g_x[offset + 1], r1_x[1], modulus);
3565 if (g_y != NULL)
3566 mpres_get_z (g_y[offset + 1], r1_y[1], modulus);
3567 if (g_x_ntt != NULL)
3568 {
3569 mpres_get_z (mt, r1_x[1], modulus);
3570 mpzspv_from_mpzv (g_x_ntt, offset + 1, &mt, 1UL, ntt_context);
3571 }
3572 if (g_y_ntt != NULL)
3573 {
3574 mpres_get_z (mt, r1_y[1], modulus);
3575 mpzspv_from_mpzv (g_y_ntt, offset + 1, &mt, 1UL, ntt_context);
3576 }
3577 }
3578
3579
3580 /* x0 := $x_0 * r^{2M - 3}$ */
3581 /* We don't need x0 after this so we overwrite it. We use v[0,1] as
3582 temp storage for $r^{2M - 3}$. */
3583 gfp_ext_pow_norm1_sl (v[0], v[1], r_x, r_y, 2UL*M - 3UL, Delta, modulus,
3584 tmplen, tmp);
3585 gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, v[0], v[1], Delta, modulus,
3586 tmplen, tmp);
3587
3588 /* Compute r2[0] = r1[0] * r^2 and r2[1] = r1[1] * r^2. */
3589 /* We only need $r^2$ from here on, so we set r = $r^2$ */
3590 gfp_ext_sqr_norm1 (r_x, r_y, r_x, r_y, modulus);
3591 gfp_ext_mul (r2_x[0], r2_y[0], r1_x[0], r1_y[0], r_x, r_y, Delta,
3592 modulus, tmplen, tmp);
3593 gfp_ext_mul (r2_x[1], r2_y[1], r1_x[1], r1_y[1], r_x, r_y, Delta,
3594 modulus, tmplen, tmp);
3595
3596 /* v[1] := $x_0 * r^{2*M - 3} + 1/(x_0 * r^{2M - 3}) */
3597 mpres_add (v[1], x0_x, x0_x, modulus);
3598 /* x0 := x0 * r = $x_0 * r^{2M - 1}$ */
3599 gfp_ext_mul (x0_x, x0_y, x0_x, x0_y, r_x, r_y, Delta, modulus,
3600 tmplen, tmp);
3601 /* v[0] := $x_0 * r^{2M - 1} + 1/(x_0 * r^{2M - 1}) */
3602 mpres_add (v[0], x0_x, x0_x, modulus);
3603
3604 /* v2 = V_2 (r + 1/r) = r^2 + 1/r^2 */
3605 mpres_add (v2, r_x, r_x, modulus);
3606
3607 /* We don't need the contents of r any more and use it as a temp var */
3608
3609 for (i = 2; i < l; i++)
3610 {
3611 if (want_x)
3612 {
3613 /* r1[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
3614 taken modulo 2. We store the new r1_x[i] in r_x for now */
3615 mpres_mul (r_x, r2_x[1 - i % 2], v[i % 2], modulus);
3616 mpres_sub (r_x, r_x, r2_x[i % 2], modulus);
3617 /* r2[i] = r2[i-1] * v[i-1] - r1[i-2] */
3618 mpres_mul (r2_x[i % 2], r2_x[1 - i % 2], v[1 - i % 2], modulus);
3619 mpres_sub (r2_x[i % 2], r2_x[i % 2], r1_x[i % 2], modulus);
3620 mpres_set (r1_x[i % 2], r_x, modulus); /* FIXME, avoid this copy */
3621 if (g_x != NULL)
3622 mpres_get_z (g_x[offset + i], r_x, modulus); /* FIXME, avoid these REDC */
3623 if (g_x_ntt != NULL)
3624 {
3625 mpres_get_z (mt, r_x, modulus);
3626 mpzspv_from_mpzv (g_x_ntt, offset + i, &mt, 1UL, ntt_context);
3627 }
3628 }
3629
3630 if (want_y)
3631 {
3632 /* Same for y coordinate */
3633 mpres_mul (r_y, r2_y[1 - i % 2], v[i % 2], modulus);
3634 mpres_sub (r_y, r_y, r2_y[i % 2], modulus);
3635 mpres_mul (r2_y[i % 2], r2_y[1 - i % 2], v[1 - i % 2], modulus);
3636 mpres_sub (r2_y[i % 2], r2_y[i % 2], r1_y[i % 2], modulus);
3637 mpres_set (r1_y[i % 2], r_y, modulus);
3638 if (g_y != NULL)
3639 mpres_get_z (g_y[offset + i], r_y, modulus); /* Keep r1, r2 in mpz_t ? */
3640 if (g_y_ntt != NULL)
3641 {
3642 mpres_get_z (mt, r_y, modulus);
3643 mpzspv_from_mpzv (g_y_ntt, offset + i, &mt, 1UL, ntt_context);
3644 }
3645 }
3646
3647 /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
3648 mpres_mul (r_x, v[1 - i % 2], v2, modulus);
3649 mpres_sub (v[i % 2], r_x, v[i % 2], modulus);
3650 if (want_output && test_verbose (OUTPUT_TRACE))
3651 {
3652 mpz_t t;
3653 mpz_init (t);
3654 mpres_get_z (t, v[i % 2], modulus);
3655 outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ "
3656 "addrec(x_0 * r^(2*(M-%lu) - 1)) == %Zd /* PARI C */\n",
3657 i, t);
3658 mpz_clear (t);
3659 }
3660 }
3661
3662 mpres_clear (r_x, modulus);
3663 mpres_clear (r_y, modulus);
3664 mpres_clear (x0_x, modulus);
3665 mpres_clear (x0_y, modulus);
3666 mpres_clear (v2, modulus);
3667 for (i = 0; i < 2; i++)
3668 {
3669 mpres_clear (r1_x[i], modulus);
3670 mpres_clear (r1_y[i], modulus);
3671 mpres_clear (r2_x[i], modulus);
3672 mpres_clear (r2_y[i], modulus);
3673 mpres_clear (v[i], modulus);
3674 }
3675 for (i = 0; i < tmplen; i++)
3676 mpres_clear (tmp[i], modulus);
3677 mpz_clear (mt);
3678 mpmod_clear (modulus);
3679 #ifdef _OPENMP
3680 }
3681 #endif
3682
3683 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3684
3685 if (g_x != NULL && g_y != NULL && test_verbose(OUTPUT_TRACE))
3686 {
3687 for (i = 0; i < l_param; i++)
3688 {
3689 outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu = "
3690 "x_0^(M-%lu) * r^((M-%lu)^2); /* PARI */", i, i, i);
3691 outputf (OUTPUT_TRACE, "/* pp1_sequence_g */ g_%lu == "
3692 "%Zd + %Zd*w /* PARI C */\n", i, g_x[i], g_y[i]);
3693 }
3694 }
3695 }
3696
3697
3698 /* Compute r[i] = b1^(-P*(k+i)^2) * f_i for i = 0, 1, ..., l-1, where "b1" is
3699 an element of norm 1 in the quadratic extension ring */
3700
3701 static void
pp1_sequence_h(listz_t h_x,listz_t h_y,mpzspv_t h_x_ntt,mpzspv_t h_y_ntt,const listz_t f,const mpres_t b1_x,const mpres_t b1_y,const long k_param,const unsigned long l_param,const unsigned long P,const mpres_t Delta,mpmod_t modulus_param,const mpzspm_t ntt_context)3702 pp1_sequence_h (listz_t h_x, listz_t h_y, mpzspv_t h_x_ntt, mpzspv_t h_y_ntt,
3703 const listz_t f, const mpres_t b1_x, const mpres_t b1_y,
3704 const long k_param, const unsigned long l_param,
3705 const unsigned long P, const mpres_t Delta,
3706 mpmod_t modulus_param, const mpzspm_t ntt_context)
3707 {
3708 unsigned long i;
3709 long timestart, realstart;
3710
3711 if (l_param == 0UL)
3712 return;
3713
3714 ASSERT (f != h_x);
3715 ASSERT (f != h_y);
3716
3717 outputf (OUTPUT_VERBOSE, "Computing h_x and h_y");
3718 timestart = cputime ();
3719 realstart = realtime ();
3720
3721 if (test_verbose (OUTPUT_TRACE))
3722 {
3723 mpz_t t;
3724 mpz_init (t);
3725 mpres_get_z (t, Delta, modulus_param);
3726 outputf (OUTPUT_TRACE, "\n/* pp1_sequence_h */ N = %Zd; "
3727 "Delta = %Zd; w = quadgen (4*Delta); k = %ld; P = %lu; "
3728 "/* PARI */\n", modulus_param->orig_modulus, t, k_param, P);
3729 outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ b_1 = ");
3730 gfp_ext_print (b1_x, b1_y, modulus_param, OUTPUT_TRACE);
3731 outputf (OUTPUT_TRACE, "; r = b_1^P; rn = b_1^(-P); /* PARI */\n");
3732 for (i = 0; i < l_param; i++)
3733 outputf (OUTPUT_TRACE,
3734 "/* pp1_sequence_h */ f_%lu = %Zd; /* PARI */\n", i, f[i]);
3735 mpz_clear (t);
3736 }
3737
3738 #ifdef _OPENMP
3739 #pragma omp parallel if (l_param > 100) private(i)
3740 #endif
3741 {
3742 const size_t tmplen = 2;
3743 mpres_t s_x[3], s_y[3], s2_x[2], s2_y[2], v[2], V2, rn_x, rn_y,
3744 tmp[2];
3745 mpmod_t modulus; /* Thread-local copy of modulus_param */
3746 mpz_t mt;
3747 unsigned long l = l_param, offset = 0;
3748 long k = k_param;
3749
3750 #ifdef _OPENMP
3751 /* When multi-threading, we adjust the parameters for each thread */
3752
3753 const int nr_chunks = omp_get_num_threads();
3754 const int thread_nr = omp_get_thread_num();
3755
3756 l = (l_param - 1) / nr_chunks + 1;
3757 offset = thread_nr * l;
3758 if (offset <= l_param)
3759 l = MIN(l, l_param - offset);
3760 else
3761 l = 0;
3762
3763 if (thread_nr == 0)
3764 outputf (OUTPUT_VERBOSE, " using %d threads", nr_chunks);
3765 outputf (OUTPUT_TRACE, "\n");
3766 #endif
3767
3768 /* Each thread computes r[i + offset] = b1^(-P*(k+i+offset)^2) * f_i
3769 for i = 0, 1, ..., l-1, where l is the adjusted length of each thread */
3770
3771 /* Test that k+offset does not overflow */
3772 ASSERT_ALWAYS (offset <= (unsigned long) LONG_MAX &&
3773 k <= LONG_MAX - (long) offset);
3774 k += (long) offset;
3775
3776 mpz_init (mt);
3777 /* Make thread-local copy of modulus */
3778 mpmod_init_set (modulus, modulus_param);
3779
3780 /* Init the local mpres_t variables */
3781 for (i = 0; i < 2; i++)
3782 {
3783 mpres_init (s_x[i], modulus);
3784 mpres_init (s_y[i], modulus);
3785 mpres_init (s2_x[i], modulus);
3786 mpres_init (s2_y[i], modulus);
3787 mpres_init (v[i], modulus);
3788 }
3789 mpres_init (s_x[2], modulus);
3790 mpres_init (s_y[2], modulus);
3791 mpres_init (V2, modulus);
3792 mpres_init (rn_x, modulus);
3793 mpres_init (rn_y, modulus);
3794 for (i = 0; i < (unsigned long) tmplen; i++)
3795 mpres_init (tmp[i], modulus);
3796
3797 /* Compute rn = b_1^{-P}. It has the same value for all threads,
3798 but we make thread local copies anyway. */
3799 gfp_ext_pow_norm1_sl (rn_x, rn_y, b1_x, b1_y, P, Delta, modulus, tmplen,
3800 tmp);
3801 mpres_neg (rn_y, rn_y, modulus);
3802
3803 /* Compute s[0] = rn^(k^2) = r^(-k^2). We do it by two
3804 exponentiations by k and use v[0] and v[1] as temp storage */
3805 gfp_ext_pow_norm1_sl (v[0], v[1], rn_x, rn_y, k, Delta, modulus,
3806 tmplen, tmp);
3807 gfp_ext_pow_norm1_sl (s_x[0], s_y[0], v[0], v[1], k, Delta, modulus,
3808 tmplen, tmp);
3809 if (test_verbose (OUTPUT_TRACE))
3810 {
3811 #ifdef _OPENMP
3812 #pragma omp critical
3813 #endif
3814 {
3815 outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ", k);
3816 gfp_ext_print (s_x[0], s_y[0], modulus, OUTPUT_TRACE);
3817 outputf (OUTPUT_TRACE, " /* PARI C */\n");
3818 }
3819 }
3820
3821 /* Compute s[1] = r^(-(k+1)^2) = r^(-(k^2 + 2k + 1))*/
3822 if (l > 1)
3823 {
3824 /* v[0] + v[1]*sqrt(Delta) still contains rn^k */
3825 gfp_ext_sqr_norm1 (s_x[1], s_y[1], v[0], v[1], modulus);
3826 /* Now s[1] = r^(-2k) */
3827 gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], s_x[0], s_y[0], Delta,
3828 modulus, tmplen, tmp);
3829 /* Now s[1] = r^(-(k^2 + 2k)) */
3830 gfp_ext_mul (s_x[1], s_y[1], s_x[1], s_y[1], rn_x, rn_y, Delta,
3831 modulus, tmplen, tmp);
3832 /* Now s[1] = r^(-(k^2 + 2k + 1)) = r^(-(k+1)^2) */
3833 if (test_verbose (OUTPUT_TRACE))
3834 {
3835 #ifdef _OPENMP
3836 #pragma omp critical
3837 #endif
3838 {
3839 outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2) == ",
3840 k + 1);
3841 gfp_ext_print (s_x[1], s_y[1], modulus, OUTPUT_TRACE);
3842 outputf (OUTPUT_TRACE, " /* PARI C */\n");
3843 }
3844 }
3845 }
3846
3847 /* Compute s2[0] = r^(k^2+2) = r^(k^2) * r^2 */
3848 gfp_ext_sqr_norm1 (v[0], v[1], rn_x, rn_y, modulus);
3849 gfp_ext_mul (s2_x[0], s2_y[0], s_x[0], s_y[0], v[0], v[1], Delta, modulus,
3850 tmplen, tmp);
3851 if (test_verbose (OUTPUT_TRACE))
3852 {
3853 #ifdef _OPENMP
3854 #pragma omp critical
3855 #endif
3856 {
3857 outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ", k);
3858 gfp_ext_print (s2_x[0], s2_y[0], modulus, OUTPUT_TRACE);
3859 outputf (OUTPUT_TRACE, " /* PARI C */\n");
3860 }
3861 }
3862
3863 /* Compute a^((k+1)^2+2) = a^((k+1)^2) * a^2 */
3864 gfp_ext_mul (s2_x[1], s2_y[1], s_x[1], s_y[1], v[0], v[1], Delta, modulus,
3865 tmplen, tmp);
3866 if (test_verbose (OUTPUT_TRACE))
3867 {
3868 #ifdef _OPENMP
3869 #pragma omp critical
3870 #endif
3871 {
3872 outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ rn^(%ld^2+2) == ",
3873 k + 1);
3874 gfp_ext_print (s2_x[1], s2_y[1], modulus, OUTPUT_TRACE);
3875 outputf (OUTPUT_TRACE, " /* PARI C */\n");
3876 }
3877 }
3878
3879 /* Compute V_2(r + 1/r). Since 1/r = rn_x - rn_y, we have r+1/r = 2*rn_x.
3880 V_2(x) = x^2 - 2, so we want 4*rn_x^2 - 2. */
3881 mpres_add (V2, rn_x, rn_x, modulus); /* V2 = r + 1/r = 2*rn_x */
3882 V (v[0], V2, 2 * k + 1, modulus); /* v[0] = V_{2k+1} (r + 1/r) */
3883 V (v[1], V2, 2 * k + 3, modulus); /* v[1] = V_{2k+3} (r + 1/r) */
3884 mpres_sqr (V2, V2, modulus); /* V2 = 4*a_x^2 */
3885 mpres_sub_ui (V2, V2, 2UL, modulus); /* V2 = 4*a_x^2 - 2 */
3886 if (test_verbose (OUTPUT_TRACE))
3887 {
3888 #ifdef _OPENMP
3889 #pragma omp critical
3890 #endif
3891 {
3892 mpres_get_z (mt, V2, modulus);
3893 outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^2 + 1/r^2 == %Zd "
3894 "/* PARI C */\n", mt);
3895 mpres_get_z (mt, v[0], modulus);
3896 outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+1) + "
3897 "1/r^(2*%ld+1) == %Zd /* PARI C */\n", k, k, mt);
3898 mpres_get_z (mt, v[1], modulus);
3899 outputf (OUTPUT_TRACE, "/* pp1_sequence_h */ r^(2*%ld+3) + "
3900 "1/r^(2*%ld+3) == %Zd /* PARI C */\n", k, k, mt);
3901 }
3902 }
3903
3904 for (i = 0; i < 2UL && i < l; i++)
3905 {
3906 /* Multiply the 2nd coordinate by Delta, so that after the polynomial
3907 multipoint evaluation we get x1 + Delta*x2 */
3908 mpres_mul (s_y[i], s_y[i], Delta, modulus);
3909 mpres_mul (s2_y[i], s2_y[i], Delta, modulus);
3910
3911 if (h_x != NULL)
3912 mpres_mul_z_to_z (h_x[i + offset], s_x[i], f[i + offset], modulus);
3913 if (h_y != NULL)
3914 mpres_mul_z_to_z (h_y[i + offset], s_y[i], f[i + offset], modulus);
3915 if (h_x_ntt != NULL)
3916 {
3917 mpres_mul_z_to_z (mt, s_x[i], f[i + offset], modulus);
3918 mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
3919 }
3920 if (h_y_ntt != NULL)
3921 {
3922 mpres_mul_z_to_z (mt, s_y[i], f[i + offset], modulus);
3923 mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
3924 }
3925 }
3926
3927 /* Compute the remaining r^((k+i)^2) values according to Peter's
3928 recurrence */
3929
3930 for (i = 2; i < l; i++)
3931 {
3932 if (h_x != NULL || h_x_ntt != NULL)
3933 {
3934 /* r[i] = r2[i-1] * v[i-2] - r2[i-2], with indices of r2 and i
3935 taken modulo 2 */
3936 mpres_mul (s_x[i % 3], s2_x[1 - i % 2], v[i % 2], modulus);
3937 mpres_sub (s_x[i % 3], s_x[i % 3], s2_x[i % 2], modulus);
3938
3939 /* r2[i] = r2[i-1] * v[i-1] - r[i-2] */
3940 mpres_mul (s2_x[i % 2], s2_x[1 - i % 2], v[1 - i % 2], modulus);
3941 mpres_sub (s2_x[i % 2], s2_x[i % 2], s_x[(i - 2) % 3], modulus);
3942 if (h_x != NULL)
3943 mpres_mul_z_to_z (h_x[i + offset], s_x[i % 3], f[i + offset],
3944 modulus);
3945 if (h_x_ntt != NULL)
3946 {
3947 mpres_mul_z_to_z (mt, s_x[i % 3], f[i + offset], modulus);
3948 mpzspv_from_mpzv (h_x_ntt, i + offset, &mt, 1UL, ntt_context);
3949 }
3950 }
3951
3952 if (h_y != NULL || h_y_ntt != NULL)
3953 {
3954 /* Same for y coordinate */
3955 mpres_mul (s_y[i % 3], s2_y[1 - i % 2], v[i % 2], modulus);
3956 mpres_sub (s_y[i % 3], s_y[i % 3], s2_y[i % 2], modulus);
3957 mpres_mul (s2_y[i % 2], s2_y[1 - i % 2], v[1 - i % 2], modulus);
3958 mpres_sub (s2_y[i % 2], s2_y[i % 2], s_y[(i - 2) % 3], modulus);
3959 if (h_y != NULL)
3960 mpres_mul_z_to_z (h_y[i + offset], s_y[i % 3], f[i + offset],
3961 modulus);
3962 if (h_y_ntt != NULL)
3963 {
3964 mpres_mul_z_to_z (mt, s_y[i % 3], f[i + offset], modulus);
3965 mpzspv_from_mpzv (h_y_ntt, i + offset, &mt, 1UL, ntt_context);
3966 }
3967 }
3968
3969 /* v[i] = v[i - 1] * V_2(a + 1/a) - v[i - 2] */
3970 mpres_mul (tmp[0], v[1 - i % 2], V2, modulus);
3971 mpres_sub (v[i % 2], tmp[0], v[i % 2], modulus);
3972 }
3973
3974 /* Clear the local mpres_t variables */
3975 for (i = 0; i < 2; i++)
3976 {
3977 mpres_clear (s_x[i], modulus);
3978 mpres_clear (s_y[i], modulus);
3979 mpres_clear (s2_x[i], modulus);
3980 mpres_clear (s2_y[i], modulus);
3981 mpres_clear (v[i], modulus);
3982 }
3983 mpres_clear (s_x[2], modulus);
3984 mpres_clear (s_y[2], modulus);
3985 mpres_clear (V2, modulus);
3986 mpres_clear (rn_x, modulus);
3987 mpres_clear (rn_y, modulus);
3988 for (i = 0; i < tmplen; i++)
3989 mpres_clear (tmp[i], modulus);
3990
3991 /* Clear the thread-local copy of modulus */
3992 mpmod_clear (modulus);
3993
3994 mpz_clear (mt);
3995 }
3996
3997 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
3998
3999 if (h_x != NULL && h_y != NULL && test_verbose (OUTPUT_TRACE))
4000 {
4001 for (i = 0; i < l_param; i++)
4002 gmp_printf ("/* pp1_sequence_h */ (rn^((k+%lu)^2) * f_%lu) == "
4003 "(%Zd + Mod(%Zd / Delta, N) * w) /* PARI C */\n",
4004 i, i, h_x[i], h_y[i]);
4005 }
4006 }
4007
4008
4009 int
pp1fs2(mpz_t f,const mpres_t X,mpmod_t modulus,const faststage2_param_t * params)4010 pp1fs2 (mpz_t f, const mpres_t X, mpmod_t modulus,
4011 const faststage2_param_t *params)
4012 {
4013 unsigned long nr;
4014 unsigned long i, l, lenF, lenH, lenG, lenR, tmplen;
4015 sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
4016 progressions of prime length */
4017 set_long_t *S_2; /* This is stored as a regular set */
4018 listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
4019 degree s_1. It is symmetric, so has only s_1 / 2 + 1
4020 distinct coefficients. The sequence h_j will be stored in
4021 the same memory and won't be a monic polynomial, so the
4022 leading 1 monomial of F will be stored explicitly. Hence we
4023 need s_1 / 2 + 1 entries. */
4024
4025 listz_t g_x, g_y, fh_x, fh_y, h_x, h_y, tmp, R_x, R_y;
4026 const unsigned long tmpreslen = 2UL;
4027 mpres_t b1_x, b1_y, Delta, tmpres[2];
4028 mpz_t mt; /* All-purpose temp mpz_t */
4029 int youpi = ECM_NO_FACTOR_FOUND;
4030 long timetotalstart, realtotalstart, timestart;
4031
4032 timetotalstart = cputime ();
4033 realtotalstart = realtime ();
4034
4035 ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
4036 ASSERT_ALWAYS (params->s_1 < params->l);
4037 nr = params->l - params->s_1; /* Number of points we evaluate */
4038
4039 if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
4040 return ECM_ERROR;
4041
4042 /* Allocate all the memory we'll need */
4043 /* Allocate the correct amount of space for each mpz_t or the
4044 reallocations will up to double the time for stage 2! */
4045 mpz_init (mt);
4046 mpres_init (b1_x, modulus);
4047 mpres_init (b1_y, modulus);
4048 mpres_init (Delta, modulus);
4049 for (i = 0; i < tmpreslen; i++)
4050 mpres_init (tmpres[i], modulus);
4051 lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
4052 the leading 1 monomial for each factor */
4053 lenH = params->s_1 + 1;
4054 lenG = params->l;
4055 lenR = nr;
4056 F = init_list2 (lenF, (unsigned int) abs (modulus->bits));
4057 fh_x = init_list2 (lenF, (unsigned int) abs (modulus->bits));
4058 fh_y = init_list2 (lenF, (unsigned int) abs (modulus->bits));
4059 h_x = malloc (lenH * sizeof (mpz_t));
4060 h_y = malloc (lenH * sizeof (mpz_t));
4061 if (h_x == NULL || h_y == NULL)
4062 {
4063 fprintf (stderr, "Cannot allocate memory in pp1fs2\n");
4064 exit (1);
4065 }
4066 g_x = init_list2 (lenG, (unsigned int) abs (modulus->bits));
4067 g_y = init_list2 (lenG, (unsigned int) abs (modulus->bits));
4068 R_x = init_list2 (lenR, (unsigned int) abs (modulus->bits));
4069 R_y = init_list2 (lenR, (unsigned int) abs (modulus->bits));
4070 tmplen = 3UL * params->l + list_mul_mem (params->l / 2) + 20;
4071 outputf (OUTPUT_DEVVERBOSE, "tmplen = %lu\n", tmplen);
4072 if (TMulGen_space (params->l - 1, params->s_1, lenR) + 12 > tmplen)
4073 {
4074 tmplen = TMulGen_space (params->l - 1, params->s_1 - 1, lenR) + 12;
4075 /* FIXME: It appears TMulGen_space() returns a too small value! */
4076 outputf (OUTPUT_DEVVERBOSE, "With TMulGen_space, tmplen = %lu\n",
4077 tmplen);
4078 }
4079
4080 tmp = init_list2 (tmplen, (unsigned int) abs (modulus->bits));
4081
4082 if (test_verbose (OUTPUT_TRACE))
4083 {
4084 mpres_get_z (mt, X, modulus); /* mpz_t copy of X for printing */
4085 outputf (OUTPUT_TRACE,
4086 "N = %Zd; X = Mod(%Zd, N); /* PARI */\n",
4087 modulus->orig_modulus, mt);
4088 }
4089
4090 /* Compute the polynomial f(x) = \prod_{k_1 in S_1} (x - X^{2 k_1}) */
4091 outputf (OUTPUT_VERBOSE, "Computing F from factored S_1");
4092
4093 timestart = cputime ();
4094 i = poly_from_sets_V (F, X, S_1, tmp, tmplen, modulus, NULL, NULL);
4095 ASSERT_ALWAYS(2 * i == params->s_1);
4096 ASSERT(mpz_cmp_ui (F[i], 1UL) == 0);
4097 free (S_1);
4098 S_1 = NULL;
4099
4100 outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
4101 if (test_verbose (OUTPUT_TRACE))
4102 {
4103 for (i = 0; i < params->s_1 / 2 + 1; i++)
4104 outputf (OUTPUT_TRACE, "f_%lu = %Zd; /* PARI */\n", i, F[i]);
4105 outputf (OUTPUT_TRACE, "f(x) = f_0");
4106 for (i = 1; i < params->s_1 / 2 + 1; i++)
4107 outputf (OUTPUT_TRACE, "+ f_%lu * (x^%lu + x^(-%lu))", i, i, i);
4108 outputf (OUTPUT_TRACE, "/* PARI */ \n");
4109 }
4110
4111 /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
4112 mpres_sqr (Delta, X, modulus);
4113 mpres_sub_ui (Delta, Delta, 4UL, modulus);
4114 mpres_div_2exp (b1_x, X, 1, modulus);
4115 mpres_set_ui (b1_y, 1UL, modulus);
4116 mpres_div_2exp (b1_y, b1_y, 1, modulus);
4117 if (test_verbose (OUTPUT_TRACE))
4118 {
4119 mpres_get_z (mt, Delta, modulus);
4120 outputf (OUTPUT_TRACE,
4121 "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
4122 gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
4123 outputf (OUTPUT_TRACE, "; /* PARI */\n");
4124 outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
4125 }
4126
4127 /* Compute the h sequence h_j = b1^(P*-j^2) * f_j for 0 <= j <= s_1 */
4128 pp1_sequence_h (fh_x, fh_y, NULL, NULL, F, b1_x, b1_y, 0L,
4129 params->s_1 / 2 + 1, params->P, Delta, modulus, NULL);
4130 /* We don't need F(x) any more */
4131 clear_list (F, lenF);
4132
4133 /* Make a symmetric copy of fh in h. */
4134 for (i = 0; i < params->s_1 / 2 + 1; i++)
4135 {
4136 *(h_x[i]) = *(fh_x[params->s_1 / 2 - i]); /* Clone the mpz_t */
4137 *(h_y[i]) = *(fh_y[params->s_1 / 2 - i]);
4138 }
4139 for (i = 0; i < params->s_1 / 2; i++)
4140 {
4141 *(h_x[i + params->s_1 / 2 + 1]) = *(fh_x[i + 1]);
4142 *(h_y[i + params->s_1 / 2 + 1]) = *(fh_y[i + 1]);
4143 }
4144 if (test_verbose (OUTPUT_TRACE))
4145 {
4146 for (i = 0; i < params->s_1 + 1; i++)
4147 outputf (OUTPUT_VERBOSE, "h_%lu = %Zd + %Zd * w; /* PARI */\n",
4148 i, h_x[i], h_y[i]);
4149 }
4150
4151 for (l = 0; l < params->s_2; l++)
4152 {
4153 const long M = params->l - 1 - params->s_1 / 2;
4154 outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
4155 l + 1, params->s_2);
4156 pp1_sequence_g (g_x, g_y, NULL, NULL, b1_x, b1_y, params->P,
4157 Delta, M, params->l, params->m_1, S_2->elem[l],
4158 modulus, NULL);
4159
4160 /* Do the two convolution products */
4161 outputf (OUTPUT_VERBOSE, "TMulGen of g_x and h_x");
4162 timestart = cputime ();
4163 if (TMulGen (R_x, nr - 1, h_x, params->s_1, g_x, params->l - 1, tmp,
4164 modulus->orig_modulus) < 0)
4165 {
4166 outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
4167 "of memory)\n");
4168 youpi = ECM_ERROR;
4169 break;
4170 }
4171 outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
4172 outputf (OUTPUT_VERBOSE, "TMulGen of g_y and h_y");
4173 timestart = cputime ();
4174 if (TMulGen (R_y, nr - 1, h_y, params->s_1, g_y, params->l - 1, tmp,
4175 modulus->orig_modulus) < 0)
4176 {
4177 outputf (OUTPUT_ERROR, "TMulGen returned error code (probably out "
4178 "of memory)\n");
4179 youpi = ECM_ERROR;
4180 break;
4181 }
4182 outputf (OUTPUT_VERBOSE, " took %lums\n", cputime () - timestart);
4183 for (i = 0; i < nr; i++)
4184 mpz_add (R_x[i], R_x[i], R_y[i]);
4185
4186 timestart = cputime ();
4187 mpres_set_ui (tmpres[1], 1UL, modulus); /* Accumulate product in
4188 tmpres[1] */
4189 for (i = 0; i < nr; i++)
4190 {
4191 mpres_set_z_for_gcd (tmpres[0], R_x[i], modulus);
4192 #define TEST_ZERO_RESULT
4193 #ifdef TEST_ZERO_RESULT
4194 if (mpres_is_zero (tmpres[0], modulus))
4195 outputf (OUTPUT_VERBOSE, "R_[%lu] = 0\n", i);
4196 #endif
4197 mpres_mul (tmpres[1], tmpres[1], tmpres[0], modulus);
4198 }
4199 outputf (OUTPUT_VERBOSE, "Computing product of F(g_i)^(1) took %lums\n",
4200 cputime () - timestart);
4201 if (test_verbose(OUTPUT_RESVERBOSE))
4202 {
4203 mpres_get_z (mt, tmpres[1], modulus);
4204 outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", mt);
4205 }
4206
4207 mpres_gcd (mt, tmpres[1], modulus);
4208 if (mpz_cmp_ui (mt, 1UL) > 0)
4209 {
4210 mpz_set (f, mt);
4211 youpi = ECM_FACTOR_FOUND_STEP2;
4212 break;
4213 }
4214 }
4215
4216 mpz_clear (mt);
4217 mpres_clear (b1_x, modulus);
4218 mpres_clear (b1_y, modulus);
4219 mpres_clear (Delta, modulus);
4220 for (i = 0; i < tmpreslen; i++)
4221 mpres_clear (tmpres[i], modulus);
4222 clear_list (fh_x, lenF);
4223 clear_list (fh_y, lenF);
4224 free (h_x);
4225 free (h_y);
4226 clear_list (g_x, lenG);
4227 clear_list (g_y, lenG);
4228 clear_list (R_x, lenR);
4229 clear_list (R_y, lenR);
4230 clear_list (tmp, tmplen);
4231 free (S_2);
4232
4233 outputf (OUTPUT_NORMAL, "Step 2");
4234 /* In normal output mode, print only cpu time as we always have.
4235 In verbose mode, print real time as well if we used multi-threading */
4236 if (test_verbose (OUTPUT_VERBOSE))
4237 print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
4238 else
4239 print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
4240
4241 return youpi;
4242 }
4243
4244
4245 int
pp1fs2_ntt(mpz_t f,const mpres_t X,mpmod_t modulus,const faststage2_param_t * params,const int twopass)4246 pp1fs2_ntt (mpz_t f, const mpres_t X, mpmod_t modulus,
4247 const faststage2_param_t *params, const int twopass)
4248 {
4249 unsigned long nr;
4250 unsigned long l, lenF;
4251 sets_long_t *S_1; /* This is stored as a set of sets (arithmetic
4252 progressions of prime length */
4253 set_long_t *S_2; /* This is stored as a regular set */
4254 listz_t F; /* Polynomial F has roots X^{k_1} for k_1 \in S_1, so has
4255 degree s_1. It is symmetric, so has only s_1 / 2 + 1
4256 distinct coefficients. The sequence h_j will be stored in
4257 the same memory and won't be a monic polynomial, so the
4258 leading 1 monomial of F will be stored explicitly. Hence we
4259 need s_1 / 2 + 1 entries. */
4260 listz_t R = NULL; /* Is used only for two-pass convolution, has nr
4261 entries. R is only ever referenced if twopass == 1,
4262 but gcc does not realize that and complains about
4263 uninitialized value, so we set it to NULL. */
4264 mpzspm_t ntt_context;
4265 mpzspv_t g_x_ntt, g_y_ntt, h_x_ntt, h_y_ntt;
4266 mpres_t b1_x, b1_y, Delta;
4267 mpz_t mt; /* All-purpose temp mpz_t */
4268 mpz_t product;
4269 mpz_t *product_ptr = NULL;
4270 int youpi = ECM_NO_FACTOR_FOUND;
4271 long timetotalstart, realtotalstart, timestart, realstart;
4272
4273 timetotalstart = cputime ();
4274 realtotalstart = realtime ();
4275
4276 ASSERT_ALWAYS (eulerphi (params->P) == params->s_1 * params->s_2);
4277 ASSERT_ALWAYS (params->s_1 < params->l);
4278 nr = params->l - params->s_1; /* Number of points we evaluate */
4279
4280 if (make_S_1_S_2 (&S_1, &S_2, params) == ECM_ERROR)
4281 return ECM_ERROR;
4282
4283 mpz_init (mt);
4284
4285 /* Prepare NTT for computing the h sequence, its DCT-I, and the convolution
4286 with g. We need NTT of transform length l here. If we want to add
4287 transformed vectors, we need to double the modulus. */
4288
4289 if (twopass)
4290 mpz_set (mt, modulus->orig_modulus);
4291 else
4292 mpz_mul_2exp (mt, modulus->orig_modulus, 1UL);
4293
4294 ntt_context = mpzspm_init (params->l, mt);
4295
4296 if (ntt_context == NULL)
4297 {
4298 outputf (OUTPUT_ERROR, "Could not initialise ntt_context, "
4299 "presumably out of memory\n");
4300 mpz_clear (mt);
4301 free (S_1);
4302 S_1 = NULL;
4303 free (S_2);
4304 S_2 = NULL;
4305 return ECM_ERROR;
4306 }
4307
4308 print_CRT_primes (OUTPUT_DEVVERBOSE, "CRT modulus for evaluation = ",
4309 ntt_context);
4310
4311 /* Allocate memory for F with correct amount of space for each mpz_t */
4312 lenF = params->s_1 / 2 + 1 + 1; /* Another +1 because poly_from_sets_V stores
4313 the leading 1 monomial for each factor */
4314 {
4315 /* in some cases the above value of lenF is not enough, for example with
4316 s_1 = 10, which gives lenF = 7, but 9 entries are needed */
4317 unsigned long mem = mem_poly_from_sets_V (S_1);
4318 if (mem > lenF)
4319 lenF = mem;
4320 }
4321
4322 F = init_list2 (lenF, (unsigned int) abs (modulus->bits) + GMP_NUMB_BITS);
4323
4324 /* Build F */
4325 if (build_F_ntt (F, X, S_1, params, modulus) == ECM_ERROR)
4326 {
4327 free (S_1);
4328 free (S_2);
4329 mpz_clear (mt);
4330 mpzspm_clear (ntt_context);
4331 clear_list (F, lenF);
4332 return ECM_ERROR;
4333 }
4334
4335 free (S_1);
4336 S_1 = NULL;
4337
4338 mpres_init (b1_x, modulus);
4339 mpres_init (b1_y, modulus);
4340 mpres_init (Delta, modulus);
4341
4342 /* Compute Delta and b1_x + b1_y * sqrt(Delta) = X) */
4343 mpres_sqr (Delta, X, modulus);
4344 mpres_sub_ui (Delta, Delta, 4UL, modulus);
4345 mpres_div_2exp (b1_x, X, 1, modulus);
4346 mpres_set_ui (b1_y, 1UL, modulus);
4347 mpres_div_2exp (b1_y, b1_y, 1, modulus);
4348 if (test_verbose (OUTPUT_TRACE))
4349 {
4350 mpres_get_z (mt, Delta, modulus);
4351 outputf (OUTPUT_TRACE,
4352 "Delta = Mod(%Zd, N); w = quadgen (4*lift(Delta)); b_1 = ", mt);
4353 gfp_ext_print (b1_x, b1_y, modulus, OUTPUT_TRACE);
4354 outputf (OUTPUT_TRACE, "; /* PARI */\n");
4355 outputf (OUTPUT_TRACE, "X == b_1 + 1/b_1 /* PARI C */\n");
4356 }
4357
4358 /* Allocate remaining memory for h_ntt */
4359 h_x_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
4360 h_y_ntt = mpzspv_init (params->l / 2 + 1, ntt_context);
4361 /* Compute the h_j sequence */
4362 pp1_sequence_h (NULL, NULL, h_x_ntt, h_y_ntt, F, b1_x, b1_y, 0L,
4363 params->s_1 / 2 + 1, params->P, Delta, modulus,
4364 ntt_context);
4365 /* We don't need F(x) any more */
4366 clear_list (F, lenF);
4367
4368 /* compute the forward transform of h and store the distinct coefficients
4369 in h_ntt */
4370 g_x_ntt = mpzspv_init (params->l, ntt_context);
4371 if (twopass)
4372 {
4373 g_y_ntt = g_x_ntt;
4374 R = init_list2 (nr, (mpz_size (modulus->orig_modulus) + 2) *
4375 GMP_NUMB_BITS);
4376 }
4377 else
4378 g_y_ntt = mpzspv_init (params->l, ntt_context);
4379
4380 /* Compute DCT-I of h_x and h_y */
4381 outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_x");
4382 #ifdef _OPENMP
4383 outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4384 #endif
4385 timestart = cputime ();
4386 realstart = realtime ();
4387 mpzspv_to_dct1 (h_x_ntt, h_x_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
4388 g_x_ntt, ntt_context);
4389 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4390
4391 outputf (OUTPUT_VERBOSE, "Computing DCT-I of h_y");
4392 #ifdef _OPENMP
4393 outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4394 #endif
4395 timestart = cputime ();
4396 realstart = realtime ();
4397 mpzspv_to_dct1 (h_y_ntt, h_y_ntt, params->s_1 / 2 + 1, params->l / 2 + 1,
4398 g_x_ntt, ntt_context);
4399 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4400
4401 if (test_verbose (OUTPUT_RESVERBOSE))
4402 {
4403 mpz_init (product);
4404 product_ptr = &product;
4405 }
4406
4407 for (l = 0; l < params->s_2; l++)
4408 {
4409 const long M = params->l - 1 - params->s_1 / 2;
4410
4411 outputf (OUTPUT_VERBOSE, "Multi-point evaluation %lu of %lu:\n",
4412 l + 1, params->s_2);
4413 if (twopass)
4414 {
4415 /* Two-pass variant. Two separate convolutions,
4416 then addition in Z/NZ */
4417 pp1_sequence_g (NULL, NULL, g_x_ntt, NULL, b1_x, b1_y, params->P,
4418 Delta, M, params->l, params->m_1, S_2->elem[l],
4419 modulus, ntt_context);
4420
4421 /* Do the convolution product of g_x * h_x */
4422 outputf (OUTPUT_VERBOSE, "Computing g_x*h_x");
4423 #ifdef _OPENMP
4424 outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4425 #endif
4426 timestart = cputime ();
4427 realstart = realtime ();
4428 mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context,
4429 NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
4430 /* Store the product coefficients we want in R */
4431 mpzspv_to_mpzv (g_x_ntt, params->s_1 / 2, R, nr, ntt_context);
4432 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4433
4434 /* Compute g_y sequence */
4435 pp1_sequence_g (NULL, NULL, NULL, g_y_ntt, b1_x, b1_y, params->P,
4436 Delta, M, params->l, params->m_1, S_2->elem[l],
4437 modulus, ntt_context);
4438
4439 /* Do the convolution product of g_y * (Delta * h_y) */
4440 outputf (OUTPUT_VERBOSE, "Computing g_y*h_y");
4441 #ifdef _OPENMP
4442 outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4443 #endif
4444 timestart = cputime ();
4445 realstart = realtime ();
4446 mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context,
4447 NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL + NTT_MUL_STEP_IFFT);
4448 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4449
4450 /* Compute product of sum of coefficients and gcd with N */
4451 ntt_gcd (mt, product_ptr, g_y_ntt, params->s_1 / 2, R, nr,
4452 ntt_context, modulus);
4453 }
4454 else
4455 {
4456 /* One-pass variant. Two forward transforms and point-wise products,
4457 then addition and single inverse transform */
4458 pp1_sequence_g (NULL, NULL, g_x_ntt, g_y_ntt, b1_x, b1_y, params->P,
4459 Delta, M, params->l, params->m_1, S_2->elem[l],
4460 modulus, ntt_context);
4461
4462 outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_x");
4463 #ifdef _OPENMP
4464 outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4465 #endif
4466 timestart = cputime ();
4467 realstart = realtime ();
4468 mpzspv_mul_by_dct (g_x_ntt, h_x_ntt, params->l, ntt_context,
4469 NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
4470 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4471
4472 outputf (OUTPUT_VERBOSE, "Computing forward NTT of g_y");
4473 #ifdef _OPENMP
4474 outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4475 #endif
4476 timestart = cputime ();
4477 realstart = realtime ();
4478 mpzspv_mul_by_dct (g_y_ntt, h_y_ntt, params->l, ntt_context,
4479 NTT_MUL_STEP_FFT1 + NTT_MUL_STEP_MUL);
4480 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4481
4482 outputf (OUTPUT_VERBOSE, "Adding and computing inverse NTT of sum");
4483 #ifdef _OPENMP
4484 outputf (OUTPUT_VERBOSE, " using %d threads", omp_get_thread_limit());
4485 #endif
4486 timestart = cputime ();
4487 realstart = realtime ();
4488 mpzspv_add (g_x_ntt, (spv_size_t) 0, g_x_ntt, (spv_size_t) 0,
4489 g_y_ntt, (spv_size_t) 0, params->l, ntt_context);
4490 mpzspv_mul_by_dct (g_x_ntt, NULL, params->l, ntt_context,
4491 NTT_MUL_STEP_IFFT);
4492 print_elapsed_time (OUTPUT_VERBOSE, timestart, realstart);
4493
4494 ntt_gcd (mt, product_ptr, g_x_ntt, params->s_1 / 2, NULL, nr,
4495 ntt_context, modulus);
4496 }
4497
4498 outputf (OUTPUT_RESVERBOSE, "Product of R[i] = %Zd\n", product);
4499
4500 if (mpz_cmp_ui (mt, 1UL) > 0)
4501 {
4502 mpz_set (f, mt);
4503 youpi = ECM_FACTOR_FOUND_STEP2;
4504 break;
4505 }
4506 }
4507
4508 if (test_verbose (OUTPUT_RESVERBOSE))
4509 {
4510 product_ptr = NULL;
4511 mpz_clear (product);
4512 }
4513 mpzspv_clear (g_x_ntt, ntt_context);
4514 if (twopass)
4515 clear_list (R, nr);
4516 else
4517 mpzspv_clear (g_y_ntt, ntt_context);
4518 mpzspv_clear (h_x_ntt, ntt_context);
4519 mpzspv_clear (h_y_ntt, ntt_context);
4520 mpzspm_clear (ntt_context);
4521 mpz_clear (mt);
4522 mpres_clear (b1_x, modulus);
4523 mpres_clear (b1_y, modulus);
4524 mpres_clear (Delta, modulus);
4525 free (S_2);
4526
4527 outputf (OUTPUT_NORMAL, "Step 2");
4528 /* In normal output mode, print only cpu time as we always have.
4529 In verbose mode, print real time as well if we used multi-threading */
4530 if (test_verbose (OUTPUT_VERBOSE))
4531 print_elapsed_time (OUTPUT_NORMAL, timetotalstart, realtotalstart);
4532 else
4533 print_elapsed_time (OUTPUT_NORMAL, timetotalstart, 0L);
4534
4535 return youpi;
4536 }
4537