1
2 #include <NTL/FFT.h>
3 #include <NTL/FFT_impl.h>
4
5 #ifdef NTL_ENABLE_AVX_FFT
6 #include <NTL/SmartPtr.h>
7 #include <NTL/pd_FFT.h>
8 #endif
9
10
11 /********************************************************************
12
13 This is an implementation of a "small prime" FFT, which lies at the heart of
14 ZZ_pX and zz_pX arithmetic, and impacts many other applications as well
15 (such as arithmetic in ZZ_pEX, zz_pEX, and ZZX).
16
17 The algorithm is a Truncated FFT based on code originally developed by David
18 Harvey. David's code built on the single-precision modular multiplication
19 technique introduced in NTL many years ago, but also uses a "lazy
20 multiplication" technique, which reduces the number of "correction" steps that
21 need to be performed in each butterfly (see below for more details). It also
22 implements a version of the Truncated FFT algorithm introduced by Joris van der
23 Hoeven at ISSAC 2004. Also see "A cache-friendly truncated FFT", David Harvey,
24 Theoretical Computer Science Volume 410, Issues 27-29, 28 June 2009, Pages
25 2649-2658.
26
27 I have almost completely re-written David's original code to make it fit into
28 NTL's software framework; however, all of the key logic is still based on
29 David's code. David's original code also implemented a 2D transformation which
30 is more cache friendly for *very* large transforms. However, my experiments
31 indicated this was only beneficial for transforms of size at least 2^20, and so
32 I did not incorporate this variant.
33
34 Here is the Copyright notice from David's original code:
35
36
37 ==============================================================================
38
39 fft62: a library for number-theoretic transforms
40
41 Copyright (C) 2013, David Harvey
42
43 All rights reserved.
44
45 Redistribution and use in source and binary forms, with or without
46 modification, are permitted provided that the following conditions are met:
47
48 * Redistributions of source code must retain the above copyright notice, this
49 list of conditions and the following disclaimer.
50 * Redistributions in binary form must reproduce the above copyright notice,
51 this list of conditions and the following disclaimer in the documentation
52 and/or other materials provided with the distribution.
53
54 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
55 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
57 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
58 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
60 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
61 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
62 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
63 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64
65
66 ==============================================================================
67
68
69 SINGLE-PRECISION MODULAR ARITHMETIC
70
71 The implementation of arithmetic modulo n, where n is a "word sized" integer is
72 critical to the performance of the FFT. Such word-sized modular arithmetic is
73 used throughout many other parts of NTL, and is a part of the external,
74 documented interface.
75
76 As NTL was initially built on top of Arjen Lenstra's LIP software, I stole a
77 lot of ideas from LIP. One very nice idea was LIP's way of handling
78 single-precision modular arithmetic. Back in those days (the early 1990's), I
79 was targeting 32-bit machines, mainly SPARC stations. LIP's strategy was to
80 restrict n to 30 bits; to compute a*b % n, where 0 <= a, b < n, the
81 following was computed:
82
83 long q = long(double(a) * double(b) / double(n));
84 long r = a*b - q*n;
85 if (r >= n)
86 r -= n;
87 else if (r < 0)
88 r += n;
89
90 With quite reasonable assumptions about floating point (certainly, anything
91 even remotely close to IEEE 64-bit doubles), the computation of q always gives
92 the true quotient floor(a*b / n), plus or minus 1. The computation of r is
93 done modulo 2^{word size}, and the following if/then/else adjusts r as
94 necessary. To be more portable, some of these computations should really be
95 done using unsigned arithmetic, but that is not so important here. Also, the
96 adjustment steps can be replaced by simple non-branching instruction sequences
97 involving SHIFT, AND, and ADD/SUB instructions. On some modern machines, this
98 is usually faster and NTL uses this non-branching strategy. However, on other
99 machines (modern x86's are an example of this), conditional move instructions
100 can be used in place of branching, and this code can be faster than the
101 non-branching code. NTL's performance-tuning script will figure out the best
102 way to do this.
103
104
105 Other simple optimizations can be done, such as precomputing 1/double(n) when n
106 remains fixed for many computations, as is often the case.
107
108 Note also that this strategy works perfectly well even when a or b are larger
109 than n, as long as the quotient itself is bounded by 2^30.
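For illustration, here is how these pieces fit together; this is only a
sketch (the name float_MulMod and the precomputed ninv are just for
exposition, not NTL's actual interface):

   // assumes 0 <= a, b < n < 2^30, and ninv == 1/double(n)
   long float_MulMod(long a, long b, long n, double ninv)
   {
      long q = long(double(a) * double(b) * ninv);  // floor(a*b/n), +/- 1
      long r = a*b - q*n;  // as noted above, strictly portable code
                           // would use unsigned arithmetic here
      if (r >= n)
         r -= n;
      else if (r < 0)
         r += n;
      return r;
   }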
110
111 This strategy worked well for many years. I had considered employing
112 "Montgomery multiplication", but did not do so for a couple of reasons:
113 1) it would require non-portable code, because Montgomery multiplication
114 requires the computation of two-word products,
115 2) I did not like the idea of working with "alternative representations"
116 for integers mod n, as this would make the interfaces more awkward.
117
118 At some point in the early 2000's, this strategy was starting to slow things
119 down, as floating point arithmetic, especially the integer/floating point
120 conversions, was starting to slow down relative to integer arithmetic. This
121 was especially true on x86 machines, which by this time were starting to become
122 the most important target. As it happens, later in the 2000's, as the x86
123 platforms started to use SSE instructions in lieu of the old x87 FPU
124 instructions, this speed differential again became less of a problem.
125 Nevertheless, I introduced a new technique that speeds things up across a
126 variety of platforms. I introduced this new technique in NTL 5.4 back in 2005.
127 I never claimed it was particularly new, and I never really documented many
128 details about it, but since then, it has come to be known as "Shoup
129 multiplication" in a few papers, so I'll accept that. :-) The paper "Faster
130 arithmetic for number-theoretic transforms" [David Harvey, J. Symb. Comp. 60
131 (2014)] seems to be the first place where it is discussed in detail,
132 and Harvey's paper also contains some improvements which I discuss below.
133
134 The basic idea is that in many computations, not only n, but one of the
135 arguments, say b, remains fixed for many computations of a*b % n, and so we
136 can afford to do a little precomputation, based on b and n, to speed things up.
137 This approach does require the ability to compute double-word products
138 (actually, just the high word of the product), but it still presents the same
139 basic interface as before (i.e., no awkward, alternative representations);
140 moreover, on platforms where we can't get double-word products, the
141 implementation falls back to the old floating point strategy, and client code
142 need not be aware of this.
143
144 The basic idea is this: suppose 0 <= n < 2^w, and 0 <= a < 2^w, and 0 <= b < n.
145 We precompute bninv = floor(2^w*b/n). Then if we compute q =
146 floor(a*bninv/2^w), it can be argued that q is either floor(a*b/n), or is 1 too
147 small. The computation of bninv can be done using the floating point
148 techniques described above. The computation of q can be done by computing the
149 high word of a double-word product (it helps if bninv is left-shifted an
150 appropriate amount first). Once we have q, we can compute a*b - q*n as before,
151 and adjust (but now only one adjustment is needed). So after the
152 precomputation, the whole operation takes 3 multiplies (one double-word and two
153 single-word), and a small handful of simple instructions (adds, shifts, etc).
154 Moreover, two of the three multiplies can start in parallel, on platforms where
155 this is possible.
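In code, the idea looks roughly like this. This is only a sketch: for
concreteness it assumes a 30-bit n and uses unsigned long long for the
double-word product; NTL's actual routines (PrepMulModPrecon/MulModPrecon)
use its ll_* helpers, support 60-bit moduli, and keep bninv pre-shifted into
the high-order bits as mentioned above.

   // precomputation, for fixed b and n, with 0 <= b < n < 2^30:
   unsigned long prep_bninv(long b, long n)
   {
      return (unsigned long) (((unsigned long long) b << 30) / n);
      // bninv = floor(2^30 * b / n)
   }

   // 0 <= a < 2^30
   long shoup_MulMod(long a, long b, long n, unsigned long bninv)
   {
      unsigned long q = (unsigned long)
                        (((unsigned long long) a * bninv) >> 30);
      // q = floor(a*bninv / 2^30), which is floor(a*b/n) or 1 too small

      unsigned long r = (unsigned long) a * b - q * (unsigned long) n;
      // r = a*b - q*n, which lies in [0, 2*n)

      if (r >= (unsigned long) n) r -= n;   // the single adjustment
      return (long) r;
   }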
156
157 David Harvey noticed that, because multiplies on modern machines are really not
158 that slow compared to additions, the cost of all of the adjustments (in the
159 MulMod, as well as in the AddMod and SubMod's in the basic FFT butterfly steps)
160 starts to dominate the cost of the FFT. Indeed, with a straightforward
161 implementation of the above ideas, there are three multiplies and three
162 adjustment steps in each butterfly step. David's idea was to work with
163 redundant representations mod n, in the range [0..4*n), and thus reduce the
164 number of adjustments per butterfly from three to one. I've implemented this
165 idea here, and it does indeed make a significant difference, which is even more
166 pronounced when all of the FFT multipliers b and corresponding bninv values are
167 precomputed. My initial implementation of David's ideas (v6.0 in 2013) only
168 implemented his approach with these precomputed tables: it seemed that
169 without these tables, it was not a significant improvement. However, I later
170 figured out how to reduce the cost of computing all the necessary data "on the
171 fly", in a way that seems only slightly (10-15%) slower overall. I introduced
172 this in v9.1 in 2015, and set things up so that now the pre-computed tables are
173 still used, but not exclusively, in such a way as to reduce the memory used by
174 these tables for very large polynomials (either very high degree or lots of FFT
175 primes). The idea here is simple, but I haven't seen it discussed elsewhere,
176 so I'll document the basic idea here.
177
178 Suppose we have the preconditioners for a and b, and want a*b % n along with
179 the preconditioner for a*b % n.
180
181 For a, let us suppose that we have both q1 and r1, where:
182 2^w*a = n*q1 + r1
183 We can obtain both q1 and r1 using floating point techniques.
184
185 Step 1. Compute a*b % n, using the integer-only MulMod, using
186 the preconditioner for either a or b.
187
188 Step 2. Compute q2 and r2 such that
189 r1*b = n*q2 + r2
190 We can obtain these using the integer-only MulMod, preconditioned on b.
191 Actually, we only need q2, not r2.
192
193 Step 3. Compute
194 q3 = q1*b + q2 mod 2^w
195 which we can compute with just a single-word multiply and an addition.
196
197 One can easily show that the value q3 computed above is indeed the
198 preconditioner for a*b % n.
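As pseudocode, the three steps are (the helper names here are hypothetical;
the real logic lives in the table-building code below, e.g., ComputeMultipliers):

   given a, b in [0, n), with 2^w*a = n*q1 + r1 (0 <= r1 < n),
   and bninv = the preconditioner for b:

      c  = MulModPrecon(a, b, n, bninv)       // Step 1: c = a*b % n
      q2 = MulModPreconQuo(r1, b, n, bninv)   // Step 2: quotient of r1*b by n
      q3 = (q1*b + q2) mod 2^w                // Step 3: single-word mul + add

   then q3 = floor(2^w*c/n), i.e., the preconditioner for c = a*b % n.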
199
200 Note that, in theory, if the computation in Step 2 is done using the
201 preconditioner for a (i.e., q1), then the multiplication q1*b in Step 3 should
202 not really be necessary (assuming that computing both high and low words of a
203 double-word product is no more expensive than just computing the low word).
204 However, none of the compilers I've used have been able to perform that
205 optimization (in NTL v11.1, I added code that hand-codes this optimization).
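Putting these pieces together, the forward butterfly in the lazy FFT keeps
values in a redundant range and needs only a single correction step. Roughly
(this is a sketch of the fwd_butterfly macro defined later in this file;
inputs x0, x1 and outputs are in [0, 2*n)):

   t  = x0 - x1 + 2*n;                       // in [0, 4*n), no correction
   x0 = x0 + x1; if (x0 >= 2*n) x0 -= 2*n;   // the one correction step
   x1 = LazyMulModPrecon(t, w, n, wqinv);    // lazy MulMod, output in [0, 2*n)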
206
207
208 64-BIT MACHINES
209
210 Current versions of NTL use (by default) 60-bit moduli based
211 on all-integer arithmetic.
212
213
214 Prior to v9.0 of NTL, on 64-bit machines, the modulus n was restricted to 50 bits,
215 in order to allow the use of double-precision techniques, as doubles have 53 bits
216 of precision. However, NTL now supports 60-bit moduli. Actually, 62 bits can
217 be supported by setting the NTL_MAXIMIZE_SP_NBITS configuration flag, but other
218 things (namely, the TBL_REM implementation in lip.cpp) start to slow down if 62
219 bits are used, so 60 seems like a good compromise. Currently, 60-bit moduli
220 are available only when compiling NTL with GMP, and when some kind of extended
221 integer or floating point arithmetic is available.
222
223
224 FUTURE TRENDS
225
226
227 * The following papers
228
229 https://eprint.iacr.org/2017/727
230 https://eprint.iacr.org/2016/504
231 https://eprint.iacr.org/2015/382
232
233 present FFTs that access the pre-computed tables in a somewhat more efficient
234 fashion, so that we only need to read from the tables O(n) times, rather than
235 O(n log n) times.
236
237 I've partially implemented this, and have gotten mixed results.
238 For smallish FFT's (below k=10 or 11), this code is somewhat slower.
239 For larger FFT's (say, k=17), I see a speedup of 3-10%.
240
241
242 ********************************************************************/
243
244
245
246 #define NTL_FFT_BIGTAB_LIMIT (180)
247 #define NTL_FFT_BIGTAB_MAXROOT (17)
248 #define NTL_FFT_BIGTAB_MINROOT (7)
249
250 // table sizes are bounded by 2^bound, where
251 // bound = NTL_FFT_BIGTAB_MAXROOT-index/NTL_FFT_BIGTAB_LIMIT.
252 // Here, index is the index of an FFT prime, or 0 for a user FFT prime.
253 // If bound <= NTL_FFT_BIGTAB_MINROOT, then big tables are not used,
254 // so only the first
255 // (NTL_FFT_BIGTAB_MAXROOT-NTL_FFT_BIGTAB_MINROOT)*NTL_FFT_BIGTAB_LIMIT
256 // FFT primes will have big tables.
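// Worked example with the default values above (illustration only):
//    index = 0    => bound = 17 - 0/180    = 17  (big tables, sizes up to 2^17)
//    index = 180  => bound = 17 - 180/180  = 16
//    index = 1799 => bound = 17 - 1799/180 = 8   (still > NTL_FFT_BIGTAB_MINROOT)
//    index = 1800 => bound = 17 - 10       = 7   (<= MINROOT, so no big tables)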
257
258 // NOTE: in newer versions of NTL (v9.1 and later), the BIGTAB
259 // code is only about 5-15% faster than the non-BIGTAB code, so
260 // this is not a great time/space trade-off.
261 // However, some further optimizations may only be implemented
262 // if big tables are used.
263
264 // NOTE: NTL_FFT_BIGTAB_MAXROOT is set independently of the parameter
265 // NTL_FFTMaxRoot defined in FFT.h (and which is typically 25).
266 // The space for the LazyTable FFTMultipliers could be reduced a bit
267 // by using min(NTL_FFT_BIGTAB_MAXROOT, NTL_FFTMaxRoot) + 1 for the
268 // size of these tables.
269
270
271
272 NTL_START_IMPL
273
274
275
276 class FFTVectorPair {
277 public:
278 Vec<long> wtab_precomp;
279 Vec<mulmod_precon_t> wqinvtab_precomp;
280 };
281
282 typedef LazyTable<FFTVectorPair, NTL_FFTMaxRoot+1> FFTMultipliers;
283
284
285 #ifdef NTL_ENABLE_AVX_FFT
286 class pd_FFTVectorPair {
287 public:
288 AlignedArray<double> wtab_precomp;
289 AlignedArray<double> wqinvtab_precomp;
290 };
291
292 typedef LazyTable<pd_FFTVectorPair, NTL_FFTMaxRoot+1> pd_FFTMultipliers;
293 #endif
294
295
296
297 class FFTMulTabs {
298 public:
299
300 #ifndef NTL_ENABLE_AVX_FFT
301 long bound;
302 FFTMultipliers MulTab;
303 #else
304 pd_FFTMultipliers pd_MulTab[2];
305 #endif
306
307 };
308
309 void FFTMulTabsDeleterPolicy::deleter(FFTMulTabs *p) { delete p; }
310
311
312
313 FFTTablesType FFTTables;
314 // a truly GLOBAL variable, shared among all threads
315
316
317
318 long IsFFTPrime(long n, long& w)
319 {
320 long m, x, y, z;
321 long j, k;
322
323
324 if (n <= 1 || n >= NTL_SP_BOUND) return 0;
325
326 if (n % 2 == 0) return 0;
327
328 if (n % 3 == 0) return 0;
329
330 if (n % 5 == 0) return 0;
331
332 if (n % 7 == 0) return 0;
333
334 m = n - 1;
335 k = 0;
336 while ((m & 1) == 0) {
337 m = m >> 1;
338 k++;
339 }
340
341 for (;;) {
342 x = RandomBnd(n);
343
344 if (x == 0) continue;
345 z = PowerMod(x, m, n);
346 if (z == 1) continue;
347
348 x = z;
349 j = 0;
350 do {
351 y = z;
352 z = MulMod(y, y, n);
353 j++;
354 } while (j != k && z != 1);
355
356 if (z != 1 || y != n-1) return 0;
357
358 if (j == k)
359 break;
360 }
361
362 /* x^{2^k} = 1 mod n, x^{2^{k-1}} = -1 mod n */
363
364 long TrialBound;
365
366 TrialBound = m >> k;
367 if (TrialBound > 0) {
368 if (!ProbPrime(n, 5)) return 0;
369
370 /* we have to do trial division by special numbers */
371
372 TrialBound = SqrRoot(TrialBound);
373
374 long a, b;
375
376 for (a = 1; a <= TrialBound; a++) {
377 b = (a << k) + 1;
378 if (n % b == 0) return 0;
379 }
380 }
381
382 /* n is an FFT prime */
383
384
385 for (j = NTL_FFTMaxRoot; j < k; j++) {
386 x = MulMod(x, x, n);
387 }
388
389 w = x;
390
391 return 1;
392 }
393
394
395 static
396 void NextFFTPrime(long& q, long& w, long index)
397 {
398 static long m = NTL_FFTMaxRootBnd + 1;
399 static long k = 0;
400 // m and k are truly GLOBAL variables, shared among
401 // all threads. Access is protected by a critical section
402 // guarding FFTTables
403
404 static long last_index = -1;
405 static long last_m = 0;
406 static long last_k = 0;
407
408 if (index == last_index) {
409 // roll back m and k...part of a simple error recovery
410 // strategy if an exception was thrown in the last
411 // invocation of UseFFTPrime...probably of academic
412 // interest only
413
414 m = last_m;
415 k = last_k;
416 }
417 else {
418 last_index = index;
419 last_m = m;
420 last_k = k;
421 }
422
423 long t, cand;
424
425 for (;;) {
426 if (k == 0) {
427 m--;
428 if (m < 5) ResourceError("ran out of FFT primes");
429 k = 1L << (NTL_SP_NBITS-m-2);
430 }
431
432 k--;
433
434 cand = (1L << (NTL_SP_NBITS-1)) + (k << (m+1)) + (1L << m) + 1;
435
436 if (!IsFFTPrime(cand, t)) continue;
437 q = cand;
438 w = t;
439 return;
440 }
441 }
442
443
444 long CalcMaxRoot(long p)
445 {
446 p = p-1;
447 long k = 0;
448 while ((p & 1) == 0) {
449 p = p >> 1;
450 k++;
451 }
452
453 if (k > NTL_FFTMaxRoot)
454 return NTL_FFTMaxRoot;
455 else
456 return k;
457 }
458
459
460
461
462 #ifndef NTL_WIZARD_HACK
463 SmartPtr<zz_pInfoT> Build_zz_pInfo(FFTPrimeInfo *info);
464 #else
465 SmartPtr<zz_pInfoT> Build_zz_pInfo(FFTPrimeInfo *info) { return 0; }
466 #endif
467
468 void UseFFTPrime(long index)
469 {
470 if (index < 0) LogicError("invalid FFT prime index");
471 if (index >= NTL_MAX_FFTPRIMES) ResourceError("FFT prime index too large");
472
473 if (index+1 >= NTL_NSP_BOUND) ResourceError("FFT prime index too large");
474 // largely academic, but it is a convenient assumption
475
476 do { // NOTE: thread safe lazy init
477 FFTTablesType::Builder bld(FFTTables, index+1);
478 long amt = bld.amt();
479 if (!amt) break;
480
481 long first = index+1-amt;
482 // initialize entries first..index
483
484 long i;
485 for (i = first; i <= index; i++) {
486 UniquePtr<FFTPrimeInfo> info;
487 info.make();
488
489 long q, w;
490 NextFFTPrime(q, w, i);
491
492 long bigtab_index = -1;
493
494 #ifdef NTL_FFT_BIGTAB
495 bigtab_index = i;
496 #endif
497
498 InitFFTPrimeInfo(*info, q, w, bigtab_index);
499 info->zz_p_context = Build_zz_pInfo(info.get());
500 bld.move(info);
501 }
502
503 } while (0);
504 }
505
506
507 #ifdef NTL_FFT_LAZYMUL
508 // we only honor the FFT_LAZYMUL flag if either the SPMM_ULL_VIABLE or LONGLONG_SP_MULMOD
509 // flag is set
510
511 #if (!defined(NTL_SPMM_ULL_VIABLE) && !defined(NTL_LONGLONG_SP_MULMOD))
512 #undef NTL_FFT_LAZYMUL
513
514 // raise an error if running the wizard
515 #if (defined(NTL_WIZARD_HACK))
516 #error "cannot honor NTL_FFT_LAZYMUL"
517 #endif
518
519 #endif
520
521 #endif
522
523
524
525
526 #ifdef NTL_FFT_LAZYMUL
527 // FFT with lazy multiplication
528
529 #ifdef NTL_CLEAN_INT
530 #define NTL_FFT_USEBUF
531 #endif
532 // DIRT: with the lazy multiplication strategy, we have to work
533 // with unsigned long's rather than long's. To avoid unnecessary
534 // copying, we simply cast long* to unsigned long*.
535 // Is this standards compliant? Does it invoke Undefined Behavior?
536 // The C++ standards before C++14 were actually somewhat inconsistent
537 // on this point.
538
539 // In all versions of the C++ and C standards, the "strict aliasing"
540 // rules [basic.lval] have always said that signed/unsigned can
541 // always alias each other. So this does not break the strict
542 // aliasing rules. However, prior to C++14, the section
543 // on Lvalue-to-rvalue conversion [conv.lval] said that
544 // this was actually UB. This has been cleared up in C++14,
545 // where now it is no longer UB. Actually, it seems that the change
546 // to C++14 was cleaning up an inconsistency in the standard
547 // itself, and not really a change in the language definition.
548
549 // In practice, it does make a significant difference in performance
550 // to avoid all these copies, so the default is to avoid them.
551
552 // See: https://stackoverflow.com/questions/30048135/efficient-way-to-bit-copy-a-signed-integer-to-an-unsigned-integer
553
554 // See: https://stackoverflow.com/questions/27109701/aliasing-of-otherwise-equivalent-signed-and-unsigned-types
555 // Especially comments by Columbo regarding N3797 and [conv.lval]
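// For reference, the cast in question has the following form (illustration
// only; in this file it is applied to the long* vectors passed in by callers):
//
//    long* x = ...;                             // signed data
//    unsigned long* ux = (unsigned long*) x;    // aliased as unsigned
//    // ... the FFT then reads and writes ux[i] ...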
556
557
558
559
560
561
562 #if (defined(NTL_LONGLONG_SP_MULMOD))
563
564
565 #if (NTL_BITS_PER_LONG >= NTL_SP_NBITS+4)
566
567 static inline unsigned long
568 sp_NormalizedLazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, unsigned long ninv)
569 {
570 unsigned long H = cast_unsigned(b);
571 unsigned long Q = ll_mul_hi(H << 4, ninv);
572 unsigned long L = cast_unsigned(b) << (NTL_SP_NBITS+2);
573 long r = L - Q*cast_unsigned(n); // r in [0..2*n)
574
575 r = sp_CorrectExcessQuo(Q, r, n);
576 rres = r;
577 return Q; // NOTE: not shifted
578 }
579
580 static inline unsigned long
581 sp_NormalizedLazyPrepMulModPrecon(long b, long n, unsigned long ninv)
582 {
583 unsigned long H = cast_unsigned(b);
584 unsigned long Q = ll_mul_hi(H << 4, ninv);
585 unsigned long L = cast_unsigned(b) << (NTL_SP_NBITS+2);
586 long r = L - Q*cast_unsigned(n); // r in [0..2*n)
587
588 Q += 1L + sp_SignMask(r-n);
589 return Q; // NOTE: not shifted
590 }
591
592
593 #else
594
595 // NTL_BITS_PER_LONG == NTL_SP_NBITS+2
596 static inline unsigned long
597 sp_NormalizedLazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, unsigned long ninv)
598 {
599 unsigned long H = cast_unsigned(b) << 2;
600 unsigned long Q = ll_mul_hi(H, (ninv << 1)) + H;
601 unsigned long rr = -Q*cast_unsigned(n); // r in [0..3*n)
602
603 long r = sp_CorrectExcessQuo(Q, rr, n);
604 r = sp_CorrectExcessQuo(Q, r, n);
605 rres = r;
606 return Q; // NOTE: not shifted
607 }
608
609 static inline unsigned long
610 sp_NormalizedLazyPrepMulModPrecon(long b, long n, unsigned long ninv)
611 {
612 unsigned long H = cast_unsigned(b) << 2;
613 unsigned long Q = ll_mul_hi(H, (ninv << 1)) + H;
614 unsigned long rr = -Q*cast_unsigned(n); // r in [0..3*n)
615 Q += 2L + sp_SignMask(rr-n) + sp_SignMask(rr-2*n);
616 return Q; // NOTE: not shifted
617 }
618
619
620 #endif
621
622
623 static inline unsigned long
624 LazyPrepMulModPrecon(long b, long n, sp_inverse ninv)
625 {
626 return sp_NormalizedLazyPrepMulModPrecon(b << ninv.shamt, n << ninv.shamt, ninv.inv) << (NTL_BITS_PER_LONG-NTL_SP_NBITS-2);
627 }
628
629
630 static inline unsigned long
631 LazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, sp_inverse ninv)
632 {
633 unsigned long qq, rr;
634 qq = sp_NormalizedLazyPrepMulModPreconWithRem(rr, b << ninv.shamt, n << ninv.shamt, ninv.inv);
635 rres = rr >> ninv.shamt;
636 return qq << (NTL_BITS_PER_LONG-NTL_SP_NBITS-2);
637 }
638
639
640
641
642
643
644
645
646 #elif (NTL_BITS_PER_LONG - NTL_SP_NBITS >= 4 && NTL_WIDE_DOUBLE_PRECISION - NTL_SP_NBITS >= 4)
647
648
649 // slightly faster functions, which should kick in on x86-64, where
650 // NTL_BITS_PER_LONG == 64
651 // NTL_SP_NBITS == 60 (another reason for holding this back to 60 bits)
652 // NTL_WIDE_DOUBLE_PRECISION == 64
653
654 // DIRT: if the relative error in floating point calculations (muls and reciprocals)
655 // is <= epsilon, the relative error in the calculations is <= 3*epsilon +
656 // O(epsilon^2), and we require that this relative error is at most
657 // 2^{-(NTL_SP_NBITS+2)}, so it should be pretty safe as long as
658 // epsilon is at most, or not much greater than, 2^{-NTL_WIDE_DOUBLE_PRECISION}.
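// For the x86-64 parameters quoted above (illustrative arithmetic only):
// epsilon = 2^{-64}, so 3*epsilon is about 2^{-62.4}, which is just below
// the required bound 2^{-(NTL_SP_NBITS+2)} = 2^{-62}.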
659
660 static inline
661 unsigned long LazyPrepMulModPrecon(long b, long n, wide_double ninv)
662 {
663 long q = (long) ( (((wide_double) b) * wide_double(4*NTL_SP_BOUND)) * ninv );
664
665 unsigned long rr = (cast_unsigned(b) << (NTL_SP_NBITS+2))
666 - cast_unsigned(q)*cast_unsigned(n);
667
668 q += sp_SignMask(rr) + sp_SignMask(rr-n) + 1L;
669
670 return cast_unsigned(q) << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
671 }
672
673 static inline
674 unsigned long LazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, wide_double ninv)
675 {
676 long q = (long) ( (((wide_double) b) * wide_double(4*NTL_SP_BOUND)) * ninv );
677
678 unsigned long rr = (cast_unsigned(b) << (NTL_SP_NBITS+2))
679 - cast_unsigned(q)*cast_unsigned(n);
680
681 long r = sp_CorrectDeficitQuo(q, rr, n);
682 r = sp_CorrectExcessQuo(q, r, n);
683
684 unsigned long qres = cast_unsigned(q) << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
685 rres = r;
686 return qres;
687 }
688
689 #else
690
691
692 static inline
693 unsigned long LazyPrepMulModPrecon(long b, long n, wide_double ninv)
694 {
695 long q = (long) ( (((wide_double) b) * wide_double(NTL_SP_BOUND)) * ninv );
696
697 unsigned long rr = (cast_unsigned(b) << (NTL_SP_NBITS))
698 - cast_unsigned(q)*cast_unsigned(n);
699
700 long r = sp_CorrectDeficitQuo(q, rr, n);
701 r = sp_CorrectExcessQuo(q, r, n);
702
703 unsigned long qq = q;
704
705 qq = 2*qq;
706 r = 2*r;
707 r = sp_CorrectExcessQuo(qq, r, n);
708
709 qq = 2*qq;
710 r = 2*r;
711 qq += sp_SignMask(r-n) + 1L;
712
713 return qq << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
714 }
715
716
717
718
719
720 static inline
721 unsigned long LazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, wide_double ninv)
722 {
723 long q = (long) ( (((wide_double) b) * wide_double(NTL_SP_BOUND)) * ninv );
724
725 unsigned long rr = (cast_unsigned(b) << (NTL_SP_NBITS))
726 - cast_unsigned(q)*cast_unsigned(n);
727
728 long r = sp_CorrectDeficitQuo(q, rr, n);
729 r = sp_CorrectExcessQuo(q, r, n);
730
731 unsigned long qq = q;
732
733 qq = 2*qq;
734 r = 2*r;
735 r = sp_CorrectExcessQuo(qq, r, n);
736
737 qq = 2*qq;
738 r = 2*r;
739 r = sp_CorrectExcessQuo(qq, r, n);
740
741 rres = r;
742 return qq << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
743 }
744
745 #endif
746
747
748
749 static inline
750 unsigned long LazyMulModPreconQuo(unsigned long a, unsigned long b,
751 unsigned long n, unsigned long bninv)
752 {
753 unsigned long q = ll_mul_hi(a, bninv);
754 unsigned long r = a*b - q*n;
755 q += sp_SignMask(r-n) + 1L;
756 return q << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
757 }
758
759
760 static inline
761 unsigned long LazyMulModPrecon(unsigned long a, unsigned long b,
762 unsigned long n, unsigned long bninv)
763 {
764 unsigned long q = ll_mul_hi(a, bninv);
765 unsigned long res = a*b - q*n;
766 return res;
767 }
768
769
770 typedef long mint_t;
771 typedef unsigned long umint_t;
772 // For readability and to make it easier to adapt this
773 // code to other settings
774
775 static inline
776 umint_t LazyReduce1(umint_t a, mint_t q)
777 {
778 return sp_CorrectExcess(mint_t(a), q);
779 }
780
781 static inline
782 umint_t LazyReduce2(umint_t a, mint_t q)
783 {
784 return sp_CorrectExcess(a, 2*q);
785 }
786
787
788 // inputs in [0, 2*n), output in [0, 4*n)
789 static inline
790 umint_t LazyAddMod(umint_t a, umint_t b, mint_t n)
791 {
792 return a+b;
793 }
794
795 // inputs in [0, 2*n), output in [0, 4*n)
796 static inline
797 umint_t LazySubMod(umint_t a, umint_t b, mint_t n)
798 {
799 return a-b+2*n;
800 }
801
802 // inputs in [0, 2*n), output in [0, 2*n)
803 static inline
804 umint_t LazyAddMod2(umint_t a, umint_t b, mint_t n)
805 {
806 umint_t r = a+b;
807 return sp_CorrectExcess(r, 2*n);
808 }
809
810 // inputs in [0, 2*n), output in [0, 2*n)
811 static inline
812 umint_t LazySubMod2(umint_t a, umint_t b, mint_t n)
813 {
814 umint_t r = a-b;
815 return sp_CorrectDeficit(r, 2*n);
816 }
817
818 #ifdef NTL_AVOID_BRANCHING
819
820 // x, y in [0, 4*m)
821 // returns x + y mod 4*m, in [0, 4*m)
822 inline static umint_t
823 LazyAddMod4(umint_t x, umint_t y, mint_t m)
824 {
825 x = LazyReduce2(x, m);
826 y = LazyReduce2(y, m);
827 return x+y;
828 }
829
830 // x, y in [0, 4*m)
831 // returns x - y mod 4*m, in [0, 4*m)
832 inline static umint_t
833 LazySubMod4(umint_t x, umint_t y, mint_t m)
834 {
835 x = LazyReduce2(x, m);
836 y = LazyReduce2(y, m);
837 return x-y+2*m;
838 }
839
840 #else
841
842 static inline umint_t
843 LazyAddMod4(umint_t x, umint_t y, umint_t m)
844 {
845 y = 4*m - y;
846 umint_t z = x - y;
847 z += (x < y) ? 4*m : 0;
848 return z;
849 }
850
851
852 static inline umint_t
853 LazySubMod4(umint_t x, umint_t y, umint_t m)
854 {
855 umint_t z = x - y;
856 z += (x < y) ? 4*m : 0;
857 return z;
858 }
859
860 #endif
861
862 // Input and output in [0, 4*n)
863 static inline umint_t
864 LazyDoubleMod4(umint_t a, mint_t n)
865 {
866 return 2 * LazyReduce2(a, n);
867 }
868
869 // Input and output in [0, 2*n)
870 static inline umint_t
871 LazyDoubleMod2(umint_t a, mint_t n)
872 {
873 return 2 * LazyReduce1(a, n);
874 }
875
876 void ComputeMultipliers(Vec<FFTVectorPair>& v, long k, mint_t q, mulmod_t qinv, const mint_t* root)
877 {
878
879 long old_len = v.length();
880 v.SetLength(k+1);
881
882 for (long s = max(old_len, 1); s <= k; s++) {
883 v[s].wtab_precomp.SetLength(1L << (s-1));
884 v[s].wqinvtab_precomp.SetLength(1L << (s-1));
885 }
886
887 if (k >= 1) {
888 v[1].wtab_precomp[0] = 1;
889 v[1].wqinvtab_precomp[0] = LazyPrepMulModPrecon(1, q, qinv);
890 }
891
892 if (k >= 2) {
893 v[2].wtab_precomp[0] = v[1].wtab_precomp[0];
894 v[2].wtab_precomp[1] = root[2];
895 v[2].wqinvtab_precomp[0] = v[1].wqinvtab_precomp[0];
896 v[2].wqinvtab_precomp[1] = LazyPrepMulModPrecon(root[2], q, qinv);
897 }
898
899 for (long s = 3; s <= k; s++) {
900 long m = 1L << s;
901 long m_half = 1L << (s-1);
902 long m_fourth = 1L << (s-2);
903 mint_t* NTL_RESTRICT wtab = v[s].wtab_precomp.elts();
904 mint_t* NTL_RESTRICT wtab1 = v[s-1].wtab_precomp.elts();
905 mulmod_precon_t* NTL_RESTRICT wqinvtab = v[s].wqinvtab_precomp.elts();
906 mulmod_precon_t* NTL_RESTRICT wqinvtab1 = v[s-1].wqinvtab_precomp.elts();
907
908 mint_t w = root[s];
909 umint_t wqinv_rem;
910 mulmod_precon_t wqinv = LazyPrepMulModPreconWithRem(wqinv_rem, w, q, qinv);
911
912
913 for (long i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
914 mint_t w_j = wtab1[j];
915 mulmod_precon_t wqi_j = wqinvtab1[j];
916
917 #if 0
918 mint_t w_i = LazyReduce1(LazyMulModPrecon(w_j, w, q, wqinv), q);
919 mulmod_precon_t wqi_i = LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j)
920 + cast_unsigned(w_j)*wqinv;
921 #else
922 // This code sequence makes sure the compiler sees
923 // that the product w_j*wqinv needs to be computed just once
924 ll_type x;
925 ll_mul(x, w_j, wqinv);
926 umint_t hi = ll_get_hi(x);
927 umint_t lo = ll_get_lo(x);
928 umint_t r = cast_unsigned(w_j)*cast_unsigned(w) - hi*cast_unsigned(q);
929
930 mint_t w_i = LazyReduce1(r, q);
931 mulmod_precon_t wqi_i = lo+LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j);
932 #endif
933
934 wtab[i-1] = w_j;
935 wqinvtab[i-1] = wqi_j;
936 wtab[i] = w_i;
937 wqinvtab[i] = wqi_i;
938 }
939 }
940
941 #if 0
942 // verify result
943 for (long s = 1; s <= k; s++) {
944 mint_t *wtab = v[s].wtab_precomp.elts();
945 mulmod_precon_t *wqinvtab = v[s].wqinvtab_precomp.elts();
946 long m_half = 1L << (s-1);
947
948 mint_t w = root[s];
949 mint_t w_i = 1;
950 for (long i = 0; i < m_half; i++) {
951 if (wtab[i] != w_i || wqinvtab[i] != LazyPrepMulModPrecon(w_i, q, qinv))
952 Error("bad table entry");
953 w_i = MulMod(w_i, w, q, qinv);
954 }
955 }
956 #endif
957 }
958
959
960 #else
961
962
963 // Hacks to make the LAZY code work with ordinary modular arithmetic
964
965 typedef long mint_t;
966 typedef long umint_t;
967
968 static inline mint_t IdentityMod(mint_t a, mint_t q) { return a; }
969 static inline mint_t DoubleMod(mint_t a, mint_t q) { return AddMod(a, a, q); }
970
971 #define LazyPrepMulModPrecon PrepMulModPrecon
972 #define LazyMulModPrecon MulModPrecon
973
974 #define LazyReduce1 IdentityMod
975 #define LazyReduce2 IdentityMod
976 #define LazyAddMod AddMod
977 #define LazySubMod SubMod
978 #define LazyAddMod2 AddMod
979 #define LazySubMod2 SubMod
980 #define LazyAddMod4 AddMod
981 #define LazySubMod4 SubMod
982 #define LazyDoubleMod2 DoubleMod
983 #define LazyDoubleMod4 DoubleMod
984
985
986 void ComputeMultipliers(Vec<FFTVectorPair>& v, long k, mint_t q, mulmod_t qinv, const mint_t* root)
987 {
988
989 long old_len = v.length();
990 v.SetLength(k+1);
991
992 for (long s = max(old_len, 1); s <= k; s++) {
993 v[s].wtab_precomp.SetLength(1L << (s-1));
994 v[s].wqinvtab_precomp.SetLength(1L << (s-1));
995 }
996
997 if (k >= 1) {
998 v[1].wtab_precomp[0] = 1;
999 v[1].wqinvtab_precomp[0] = PrepMulModPrecon(1, q, qinv);
1000 }
1001
1002 if (k >= 2) {
1003 v[2].wtab_precomp[0] = v[1].wtab_precomp[0];
1004 v[2].wtab_precomp[1] = root[2];
1005 v[2].wqinvtab_precomp[0] = v[1].wqinvtab_precomp[0];
1006 v[2].wqinvtab_precomp[1] = PrepMulModPrecon(root[2], q, qinv);
1007 }
1008
1009 for (long s = 3; s <= k; s++) {
1010 long m = 1L << s;
1011 long m_half = 1L << (s-1);
1012 long m_fourth = 1L << (s-2);
1013 mint_t* NTL_RESTRICT wtab = v[s].wtab_precomp.elts();
1014 mint_t* NTL_RESTRICT wtab1 = v[s-1].wtab_precomp.elts();
1015 mulmod_precon_t* NTL_RESTRICT wqinvtab = v[s].wqinvtab_precomp.elts();
1016 mulmod_precon_t* NTL_RESTRICT wqinvtab1 = v[s-1].wqinvtab_precomp.elts();
1017
1018 mint_t w = root[s];
1019 mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
1020
1021
1022 for (long i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
1023 mint_t w_j = wtab1[j];
1024 mulmod_precon_t wqi_j = wqinvtab1[j];
1025
1026 mint_t w_i = MulModPrecon(w_j, w, q, wqinv);
1027 mulmod_precon_t wqi_i = PrepMulModPrecon(w_i, q, qinv);
1028
1029 wtab[i-1] = w_j;
1030 wqinvtab[i-1] = wqi_j;
1031 wtab[i] = w_i;
1032 wqinvtab[i] = wqi_i;
1033 }
1034 }
1035
1036 #if 0
1037 // verify result
1038 for (long s = 1; s <= k; s++) {
1039 mint_t *wtab = v[s].wtab_precomp.elts();
1040 mulmod_precon_t *wqinvtab = v[s].wqinvtab_precomp.elts();
1041 long m_half = 1L << (s-1);
1042
1043 mint_t w = root[s];
1044 mint_t w_i = 1;
1045 for (long i = 0; i < m_half; i++) {
1046 if (wtab[i] != w_i || wqinvtab[i] != PrepMulModPrecon(w_i, q, qinv))
1047 Error("bad table entry");
1048 w_i = MulMod(w_i, w, q, qinv);
1049 }
1050 }
1051 #endif
1052 }
1053
1054 #endif
1055
1056
1057
1058 static
1059 void LazyPrecompFFTMultipliers(long k, mint_t q, mulmod_t qinv, const mint_t *root, const FFTMultipliers& tab)
1060 {
1061 if (k < 1) LogicError("LazyPrecompFFTMultipliers: bad input");
1062
1063 do { // NOTE: thread safe lazy init
1064 FFTMultipliers::Builder bld(tab, k+1);
1065 long amt = bld.amt();
1066 if (!amt) break;
1067
1068 long first = k+1-amt;
1069 // initialize entries first..k
1070
1071
1072 for (long s = first; s <= k; s++) {
1073 UniquePtr<FFTVectorPair> item;
1074
1075 if (s == 0) {
1076 bld.move(item); // position 0 not used
1077 continue;
1078 }
1079
1080 if (s == 1) {
1081 item.make();
1082 item->wtab_precomp.SetLength(1);
1083 item->wqinvtab_precomp.SetLength(1);
1084 item->wtab_precomp[0] = 1;
1085 item->wqinvtab_precomp[0] = LazyPrepMulModPrecon(1, q, qinv);
1086 bld.move(item);
1087 continue;
1088 }
1089
1090 item.make();
1091 item->wtab_precomp.SetLength(1L << (s-1));
1092 item->wqinvtab_precomp.SetLength(1L << (s-1));
1093
1094 long m = 1L << s;
1095 long m_half = 1L << (s-1);
1096 long m_fourth = 1L << (s-2);
1097
1098 const mint_t *wtab_last = tab[s-1]->wtab_precomp.elts();
1099 const mulmod_precon_t *wqinvtab_last = tab[s-1]->wqinvtab_precomp.elts();
1100
1101 mint_t *wtab = item->wtab_precomp.elts();
1102 mulmod_precon_t *wqinvtab = item->wqinvtab_precomp.elts();
1103
1104 for (long i = 0; i < m_fourth; i++) {
1105 wtab[i] = wtab_last[i];
1106 wqinvtab[i] = wqinvtab_last[i];
1107 }
1108
1109 mint_t w = root[s];
1110 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
1111
1112 // prepare wtab...
1113
1114 if (s == 2) {
1115 wtab[1] = LazyReduce1(LazyMulModPrecon(wtab[0], w, q, wqinv), q);
1116 wqinvtab[1] = LazyPrepMulModPrecon(wtab[1], q, qinv);
1117 }
1118 else {
1119 long i, j;
1120
1121 i = m_half-1; j = m_fourth-1;
1122 wtab[i-1] = wtab[j];
1123 wqinvtab[i-1] = wqinvtab[j];
1124 wtab[i] = LazyReduce1(LazyMulModPrecon(wtab[i-1], w, q, wqinv), q);
1125
1126 i -= 2; j --;
1127
1128 for (; i >= 0; i -= 2, j --) {
1129 mint_t wp2 = wtab[i+2];
1130 mint_t wm1 = wtab[j];
1131 wqinvtab[i+2] = LazyPrepMulModPrecon(wp2, q, qinv);
1132 wtab[i-1] = wm1;
1133 wqinvtab[i-1] = wqinvtab[j];
1134 wtab[i] = LazyReduce1(LazyMulModPrecon(wm1, w, q, wqinv), q);
1135 }
1136
1137 wqinvtab[1] = LazyPrepMulModPrecon(wtab[1], q, qinv);
1138 }
1139
1140 bld.move(item);
1141 }
1142 } while (0);
1143 }
1144
1145
1146 //===================================================================
1147
1148 // TRUNCATED FFT
1149
1150 // This code is derived from code originally developed
1151 // by David Harvey. I include his original documentation,
1152 // annotated appropriately to highlight differences in
1153 // the implementation (see NOTEs).
1154
1155 /*
1156 The DFT is defined as follows.
1157
1158 Let the input sequence be a_0, ..., a_{N-1}.
1159
1160 Let w = standard primitive N-th root of 1, i.e. w = g^(2^FFT62_MAX_LGN / N),
1161 where g = some fixed element of Z/pZ of order 2^FFT62_MAX_LGN.
1162
1163 Let Z = an element of (Z/pZ)^* (twisting parameter).
1164
1165 Then the output sequence is
1166 b_j = \sum_{0 <= i < N} Z^i a_i w^(ij'), for 0 <= j < N,
1167 where j' is the length-lgN bit-reversal of j.
1168
1169 Some of the FFT routines can operate on truncated sequences of certain
1170 "admissible" sizes. A size parameter n is admissible if 1 <= n <= N, and n is
1171 divisible by a certain power of 2. The precise power depends on the recursive
1172 array decomposition of the FFT. The smallest admissible n' >= n can be
1173 obtained via fft62_next_size().
1174 */
1175
1176 // NOTE: the twisting parameter is not implemented.
1177 // NOTE: the next admissible size function is called FFTRoundUp,
1178 // and is defined in FFT.h.
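// For reference, ignoring the twisting parameter, the transform computed
// here has the following naive O(N^2) semantics (illustrative sketch only,
// not code used anywhere in NTL):
//
//    // b[j] = sum_{0 <= i < N} a[i] * w^(i * bitrev(j)) mod p,
//    // where bitrev(j) reverses the low lgN bits of j
//    for (long j = 0; j < N; j++) {
//       long jrev = 0;
//       for (long s = 0; s < lgN; s++) jrev |= ((j >> s) & 1L) << (lgN-1-s);
//       long wj = PowerMod(w, jrev, p);   // w^(bitrev(j))
//       long sum = 0, t = 1;
//       for (long i = 0; i < N; i++) {
//          sum = AddMod(sum, MulMod(a[i], t, p), p);
//          t = MulMod(t, wj, p);
//       }
//       b[j] = sum;
//    }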
1179
1180
1181 /*
1182 Truncated FFT interface is as follows:
1183
1184 xn and yn must be admissible sizes for N.
1185
1186 Input in xp[] is a_0, a_1, ..., a_{xn-1}. Assumes a_i = 0 for xn <= i < N.
1187
1188 Output in yp[] is b_0, ..., b_{yn-1}, i.e. only first yn outputs are computed.
1189
1190 Twisting parameter Z is described by z and lgH. If z == 0, then Z = basic
1191 2^lgH-th root of 1, and must have lgH >= lgN + 1. If z != 0, then Z = z
1192 (and lgH is ignored).
1193
1194 The buffers {xp,xn} and {yp,yn} may overlap, but only if xp == yp.
1195
1196 Inputs are in [0, 2p), outputs are in [0, 2p).
1197
1198 threads = number of OpenMP threads to use.
1199 */
1200
1201
1202
1203 /*
1204 Inverse truncated FFT interface is as follows.
1205
1206 xn and yn must be admissible sizes for N, with yn <= xn.
1207
1208 Input in xp[] is b_0, b_1, ..., b_{yn-1}, N*a_{yn}, ..., N*a_{xn-1}.
1209
1210 Assumes a_i = 0 for xn <= i < N.
1211
1212 Output in yp[] is N*a_0, ..., N*a_{yn-1}.
1213
1214 Twisting parameter Z is described by z and lgH. If z == 0, then Z = basic
1215 2^lgH-th root of 1, and must have lgH >= lgN + 1. If z != 0, then Z = z^(-1)
1216 (and lgH is ignored).
1217
1218 The buffers {xp,xn} and {yp,yn} may overlap, but only if xp == yp.
1219
1220 Inputs are in [0, 4p), outputs are in [0, 4p).
1221
1222 threads = number of OpenMP threads to use.
1223
1224 (note: no function actually implements this interface in full generality!
1225 This is because it is tricky (and not that useful) to implement the twisting
1226 parameter when xn != yn.)
1227 */
1228
1229 // NOTE: threads and twisting parameter are not used here.
1230 // NOTE: the code has been re-written and simplified so that
1231 // everything is done in place, so xp == yp.
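// Very roughly, the in-place usage pattern implemented below looks like this
// (hedged sketch only; see new_fft_short and new_ifft_short1 further down,
// and FFTRoundUp in FFT.h, which is assumed here to round a length up to the
// next admissible size for 2^lgN):
//
//    long xn_r = FFTRoundUp(xn, lgN);
//    long yn_r = FFTRoundUp(yn, lgN);
//    // forward: xp[0..xn_r) holds a_i (zero beyond xn), entries in [0, 2p)
//    new_fft_short(xp, yn_r, xn_r, lgN, mod);  // xp[0..yn_r) now holds b_j
//    // inverse (xn == yn case): entries in [0, 4p)
//    new_ifft_short1(xp, yn_r, lgN, mod);      // xp[0..yn_r) now holds N*a_i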
1232
1233
1234
1235
1236 //===================================================================
1237
1238
1239
1240
1241
1242
1243 // NOTE: these could be inlined, but I found the code generation
1244 // to be extremely sensitive to seemingly trivial changes,
1245 // so it seems safest to use macros instead.
1246 // w and wqinv are read only once.
1247 // q is read several times.
1248 // xx0, xx1 are read once and written once
1249
1250 #define fwd_butterfly(xx0, xx1, w, q, wqinv) \
1251 do \
1252 { \
1253 umint_t x0_ = xx0; \
1254 umint_t x1_ = xx1; \
1255 umint_t t_ = LazySubMod(x0_, x1_, q); \
1256 xx0 = LazyAddMod2(x0_, x1_, q); \
1257 xx1 = LazyMulModPrecon(t_, w, q, wqinv); \
1258 } \
1259 while (0)
1260
1261 #define fwd_butterfly_neg(xx0, xx1, w, q, wqinv) \
1262 do \
1263 { \
1264 umint_t x0_ = xx0; \
1265 umint_t x1_ = xx1; \
1266 umint_t t_ = LazySubMod(x1_, x0_, q); /* NEG */ \
1267 xx0 = LazyAddMod2(x0_, x1_, q); \
1268 xx1 = LazyMulModPrecon(t_, w, q, wqinv); \
1269 } \
1270 while (0)
1271
1272 #define fwd_butterfly1(xx0, xx1, w, q, wqinv, w1, w1qinv) \
1273 do \
1274 { \
1275 umint_t x0_ = xx0; \
1276 umint_t x1_ = xx1; \
1277 umint_t t_ = LazySubMod(x0_, x1_, q); \
1278 xx0 = LazyAddMod2(x0_, x1_, q); \
1279 xx1 = LazyMulModPrecon(LazyMulModPrecon(t_, w1, q, w1qinv), w, q, wqinv); \
1280 } \
1281 while (0)
1282
1283
1284 #define fwd_butterfly0(xx0, xx1, q) \
1285 do \
1286 { \
1287 umint_t x0_ = xx0; \
1288 umint_t x1_ = xx1; \
1289 xx0 = LazyAddMod2(x0_, x1_, q); \
1290 xx1 = LazySubMod2(x0_, x1_, q); \
1291 } \
1292 while (0)
1293
1294
1295 #define NTL_NEW_FFT_THRESH (11)
1296
1297 struct new_mod_t {
1298 mint_t q;
1299 const mint_t **wtab;
1300 const mulmod_precon_t **wqinvtab;
1301 };
1302
1303
1304
1305
1306
1307 // requires size divisible by 8
1308 static void
1309 new_fft_layer(umint_t* xp, long blocks, long size,
1310 const mint_t* NTL_RESTRICT wtab,
1311 const mulmod_precon_t* NTL_RESTRICT wqinvtab,
1312 mint_t q)
1313 {
1314 size /= 2;
1315
1316 do
1317 {
1318 umint_t* NTL_RESTRICT xp0 = xp;
1319 umint_t* NTL_RESTRICT xp1 = xp + size;
1320
1321 // first 4 butterflies
1322 fwd_butterfly0(xp0[0+0], xp1[0+0], q);
1323 fwd_butterfly(xp0[0+1], xp1[0+1], wtab[0+1], q, wqinvtab[0+1]);
1324 fwd_butterfly(xp0[0+2], xp1[0+2], wtab[0+2], q, wqinvtab[0+2]);
1325 fwd_butterfly(xp0[0+3], xp1[0+3], wtab[0+3], q, wqinvtab[0+3]);
1326
1327 // 4-way unroll
1328 for (long j = 4; j < size; j += 4) {
1329 fwd_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
1330 fwd_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
1331 fwd_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
1332 fwd_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
1333 }
1334
1335 xp += 2 * size;
1336 }
1337 while (--blocks != 0);
1338 }
1339
1340
1341 static void
1342 new_fft_last_two_layers(umint_t* xp, long blocks,
1343 const mint_t* wtab, const mulmod_precon_t* wqinvtab,
1344 mint_t q)
1345 {
1346 // 4th root of unity
1347 mint_t w = wtab[1];
1348 mulmod_precon_t wqinv = wqinvtab[1];
1349
1350 do
1351 {
1352 umint_t u0 = xp[0];
1353 umint_t u1 = xp[1];
1354 umint_t u2 = xp[2];
1355 umint_t u3 = xp[3];
1356
1357 umint_t v0 = LazyAddMod2(u0, u2, q);
1358 umint_t v2 = LazySubMod2(u0, u2, q);
1359 umint_t v1 = LazyAddMod2(u1, u3, q);
1360 umint_t t = LazySubMod(u1, u3, q);
1361 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1362
1363 xp[0] = LazyAddMod2(v0, v1, q);
1364 xp[1] = LazySubMod2(v0, v1, q);
1365 xp[2] = LazyAddMod2(v2, v3, q);
1366 xp[3] = LazySubMod2(v2, v3, q);
1367
1368 xp += 4;
1369 }
1370 while (--blocks != 0);
1371 }
1372
1373
1374
1375 void new_fft_base(umint_t* xp, long lgN, const new_mod_t& mod)
1376 {
1377 if (lgN == 0) return;
1378
1379 mint_t q = mod.q;
1380
1381 if (lgN == 1)
1382 {
1383 umint_t x0 = xp[0];
1384 umint_t x1 = xp[1];
1385 xp[0] = LazyAddMod2(x0, x1, q);
1386 xp[1] = LazySubMod2(x0, x1, q);
1387 return;
1388 }
1389
1390 const mint_t** wtab = mod.wtab;
1391 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1392
1393 long N = 1L << lgN;
1394
1395 for (long j = lgN, size = N, blocks = 1;
1396 j > 2; j--, blocks <<= 1, size >>= 1)
1397 new_fft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
1398
1399 new_fft_last_two_layers(xp, N/4, wtab[2], wqinvtab[2], q);
1400 }
1401
1402
1403 // Implements the truncated FFT interface, described above.
1404 // All computations done in place, and xp should point to
1405 // an array of size N, all of which may be overwritten
1406 // during the computation.
1407 static
1408 void new_fft_short(umint_t* xp, long yn, long xn, long lgN,
1409 const new_mod_t& mod)
1410 {
1411 long N = 1L << lgN;
1412
1413 if (yn == N)
1414 {
1415 if (xn == N && lgN <= NTL_NEW_FFT_THRESH)
1416 {
1417 // no truncation
1418 new_fft_base(xp, lgN, mod);
1419 return;
1420 }
1421 }
1422
1423 // divide-and-conquer algorithm
1424
1425 long half = N >> 1;
1426 mint_t q = mod.q;
1427
1428 if (yn <= half)
1429 {
1430 if (xn <= half)
1431 {
1432 new_fft_short(xp, yn, xn, lgN - 1, mod);
1433 }
1434 else
1435 {
1436 xn -= half;
1437
1438 // (X, Y) -> X + Y
1439 for (long j = 0; j < xn; j++)
1440 xp[j] = LazyAddMod2(xp[j], xp[j + half], q);
1441
1442 new_fft_short(xp, yn, half, lgN - 1, mod);
1443 }
1444 }
1445 else
1446 {
1447 yn -= half;
1448
1449 umint_t* NTL_RESTRICT xp0 = xp;
1450 umint_t* NTL_RESTRICT xp1 = xp + half;
1451 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
1452 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
1453
1454 if (xn <= half)
1455 {
1456 // X -> (X, w*X)
1457 for (long j = 0; j < xn; j++)
1458 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j], q, wqinvtab[j]);
1459
1460 new_fft_short(xp0, half, xn, lgN - 1, mod);
1461 new_fft_short(xp1, yn, xn, lgN - 1, mod);
1462 }
1463 else
1464 {
1465 xn -= half;
1466
1467 // (X, Y) -> (X + Y, w*(X - Y))
1468 // DIRT: assumes xn is a multiple of 4
1469 fwd_butterfly0(xp0[0], xp1[0], q);
1470 fwd_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
1471 fwd_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
1472 fwd_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
1473 for (long j = 4; j < xn; j+=4) {
1474 fwd_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
1475 fwd_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
1476 fwd_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
1477 fwd_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
1478 }
1479
1480 // X -> (X, w*X)
1481 for (long j = xn; j < half; j++)
1482 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j], q, wqinvtab[j]);
1483
1484 new_fft_short(xp0, half, half, lgN - 1, mod);
1485 new_fft_short(xp1, yn, half, lgN - 1, mod);
1486 }
1487 }
1488 }
1489
1490 static
1491 void new_fft_short_notab(umint_t* xp, long yn, long xn, long lgN,
1492 const new_mod_t& mod, mint_t w, mulmod_precon_t wqinv)
1493 // This version assumes that we only have tables up to level lgN-1,
1494 // and w generates the values at level lgN.
1495 // DIRT: requires xn even
1496 {
1497 long N = 1L << lgN;
1498
1499 // divide-and-conquer algorithm
1500
1501 long half = N >> 1;
1502 mint_t q = mod.q;
1503
1504 if (yn <= half)
1505 {
1506 if (xn <= half)
1507 {
1508 new_fft_short(xp, yn, xn, lgN - 1, mod);
1509 }
1510 else
1511 {
1512 xn -= half;
1513
1514 // (X, Y) -> X + Y
1515 for (long j = 0; j < xn; j++)
1516 xp[j] = LazyAddMod2(xp[j], xp[j + half], q);
1517
1518 new_fft_short(xp, yn, half, lgN - 1, mod);
1519 }
1520 }
1521 else
1522 {
1523 yn -= half;
1524
1525 umint_t* NTL_RESTRICT xp0 = xp;
1526 umint_t* NTL_RESTRICT xp1 = xp + half;
1527 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN-1];
1528 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN-1];
1529
1530 if (xn <= half)
1531 {
1532 // X -> (X, w*X)
1533 for (long j = 0, j_half = 0; j < xn; j+=2, j_half++) {
1534 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j_half], q, wqinvtab[j_half]);
1535 xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(xp0[j+1], w, q, wqinv),
1536 wtab[j_half], q, wqinvtab[j_half]);
1537 }
1538
1539 new_fft_short(xp0, half, xn, lgN - 1, mod);
1540 new_fft_short(xp1, yn, xn, lgN - 1, mod);
1541 }
1542 else
1543 {
1544 xn -= half;
1545
1546 // (X, Y) -> (X + Y, w*(X - Y))
1547 fwd_butterfly0(xp0[0], xp1[0], q);
1548 fwd_butterfly(xp0[1], xp1[1], w, q, wqinv);
1549 long j = 2;
1550 long j_half = 1;
1551 for (; j < xn; j+=2, j_half++) {
1552 fwd_butterfly(xp0[j], xp1[j], wtab[j_half], q, wqinvtab[j_half]);
1553 fwd_butterfly1(xp0[j+1], xp1[j+1], wtab[j_half], q, wqinvtab[j_half], w, wqinv);
1554 }
1555
1556 // X -> (X, w*X)
1557 for (; j < half; j+=2, j_half++) {
1558 xp1[j] = LazyMulModPrecon(xp0[j], wtab[j_half], q, wqinvtab[j_half]);
1559 xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(xp0[j+1], w, q, wqinv),
1560 wtab[j_half], q, wqinvtab[j_half]);
1561 }
1562
1563 new_fft_short(xp0, half, half, lgN - 1, mod);
1564 new_fft_short(xp1, yn, half, lgN - 1, mod);
1565 }
1566 }
1567 }
1568
1569
1570 //=====
1571
1572
1573 // NOTE: these "flipped" routines perform the same
1574 // functions as their normal, "unflipped" counter-parts,
1575 // except that they work with inverted roots.
1576 // They also perform no truncation, just to keep things simple.
1577 // All of this is necessary only to implement the UpdateMap
1578 // routines for ZZ_pX and zz_pX.
1579
1580 // requires size divisible by 8
1581 static void
1582 new_fft_layer_flipped(umint_t* xp, long blocks, long size,
1583 const mint_t* wtab,
1584 const mulmod_precon_t* wqinvtab,
1585 mint_t q)
1586 {
1587 size /= 2;
1588
1589 const mint_t* NTL_RESTRICT wtab1 = wtab + size;
1590 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + size;
1591
1592 do
1593 {
1594 umint_t* NTL_RESTRICT xp0 = xp;
1595 umint_t* NTL_RESTRICT xp1 = xp + size;
1596
1597 // first 4 butterflies
1598 fwd_butterfly0(xp0[0+0], xp1[0+0], q);
1599 fwd_butterfly_neg(xp0[0+1], xp1[0+1], wtab1[-(0+1)], q, wqinvtab1[-(0+1)]);
1600 fwd_butterfly_neg(xp0[0+2], xp1[0+2], wtab1[-(0+2)], q, wqinvtab1[-(0+2)]);
1601 fwd_butterfly_neg(xp0[0+3], xp1[0+3], wtab1[-(0+3)], q, wqinvtab1[-(0+3)]);
1602
1603 // 4-way unroll
1604 for (long j = 4; j < size; j += 4) {
1605 fwd_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1606 fwd_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1607 fwd_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1608 fwd_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1609 }
1610
1611 xp += 2 * size;
1612 }
1613 while (--blocks != 0);
1614 }
1615
1616
1617
1618 static void
1619 new_fft_last_two_layers_flipped(umint_t* xp, long blocks,
1620 const mint_t* wtab, const mulmod_precon_t* wqinvtab,
1621 mint_t q)
1622 {
1623 // 4th root of unity
1624 mint_t w = wtab[1];
1625 mulmod_precon_t wqinv = wqinvtab[1];
1626
1627 do
1628 {
1629 umint_t u0 = xp[0];
1630 umint_t u1 = xp[1];
1631 umint_t u2 = xp[2];
1632 umint_t u3 = xp[3];
1633
1634 umint_t v0 = LazyAddMod2(u0, u2, q);
1635 umint_t v2 = LazySubMod2(u0, u2, q);
1636 umint_t v1 = LazyAddMod2(u1, u3, q);
1637 umint_t t = LazySubMod(u3, u1, q); // NEG
1638 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1639
1640 xp[0] = LazyAddMod2(v0, v1, q);
1641 xp[1] = LazySubMod2(v0, v1, q);
1642 xp[2] = LazyAddMod2(v2, v3, q);
1643 xp[3] = LazySubMod2(v2, v3, q);
1644
1645 xp += 4;
1646 }
1647 while (--blocks != 0);
1648 }
1649
1650
1651
1652 void new_fft_base_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
1653 {
1654 if (lgN == 0) return;
1655
1656 mint_t q = mod.q;
1657
1658 if (lgN == 1)
1659 {
1660 umint_t x0 = xp[0];
1661 umint_t x1 = xp[1];
1662 xp[0] = LazyAddMod2(x0, x1, q);
1663 xp[1] = LazySubMod2(x0, x1, q);
1664 return;
1665 }
1666
1667 const mint_t** wtab = mod.wtab;
1668 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1669
1670 long N = 1L << lgN;
1671
1672 for (long j = lgN, size = N, blocks = 1;
1673 j > 2; j--, blocks <<= 1, size >>= 1)
1674 new_fft_layer_flipped(xp, blocks, size, wtab[j], wqinvtab[j], q);
1675
1676 new_fft_last_two_layers_flipped(xp, N/4, wtab[2], wqinvtab[2], q);
1677 }
1678
1679
1680 static
1681 void new_fft_short_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
1682 {
1683 long N = 1L << lgN;
1684
1685 if (lgN <= NTL_NEW_FFT_THRESH)
1686 {
1687 new_fft_base_flipped(xp, lgN, mod);
1688 return;
1689 }
1690
1691 // divide-and-conquer algorithm
1692
1693 long half = N >> 1;
1694 mint_t q = mod.q;
1695
1696 umint_t* NTL_RESTRICT xp0 = xp;
1697 umint_t* NTL_RESTRICT xp1 = xp + half;
1698 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN] + half;
1699 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN] + half;
1700
1701 // (X, Y) -> (X + Y, w*(X - Y))
1702
1703 fwd_butterfly0(xp0[0], xp1[0], q);
1704 fwd_butterfly_neg(xp0[1], xp1[1], wtab[-1], q, wqinvtab[-1]);
1705 fwd_butterfly_neg(xp0[2], xp1[2], wtab[-2], q, wqinvtab[-2]);
1706 fwd_butterfly_neg(xp0[3], xp1[3], wtab[-3], q, wqinvtab[-3]);
1707 for (long j = 4; j < half; j+=4) {
1708 fwd_butterfly_neg(xp0[j+0], xp1[j+0], wtab[-(j+0)], q, wqinvtab[-(j+0)]);
1709 fwd_butterfly_neg(xp0[j+1], xp1[j+1], wtab[-(j+1)], q, wqinvtab[-(j+1)]);
1710 fwd_butterfly_neg(xp0[j+2], xp1[j+2], wtab[-(j+2)], q, wqinvtab[-(j+2)]);
1711 fwd_butterfly_neg(xp0[j+3], xp1[j+3], wtab[-(j+3)], q, wqinvtab[-(j+3)]);
1712 }
1713
1714 new_fft_short_flipped(xp0, lgN - 1, mod);
1715 new_fft_short_flipped(xp1, lgN - 1, mod);
1716 }
1717
1718
1719
1720 // IFFT (inverse truncated FFT)
1721
1722
1723 #define inv_butterfly0(xx0, xx1, q) \
1724 do \
1725 { \
1726 umint_t x0_ = LazyReduce2(xx0, q); \
1727 umint_t x1_ = LazyReduce2(xx1, q); \
1728 xx0 = LazyAddMod(x0_, x1_, q); \
1729 xx1 = LazySubMod(x0_, x1_, q); \
1730 } while (0)
1731
1732
1733 #define inv_butterfly_neg(xx0, xx1, w, q, wqinv) \
1734 do \
1735 { \
1736 umint_t x0_ = LazyReduce2(xx0, q); \
1737 umint_t x1_ = xx1; \
1738 umint_t t_ = LazyMulModPrecon(x1_, w, q, wqinv); \
1739 xx0 = LazySubMod(x0_, t_, q); /* NEG */ \
1740 xx1 = LazyAddMod(x0_, t_, q); /* NEG */ \
1741 } while (0)
1742
1743 #define inv_butterfly(xx0, xx1, w, q, wqinv) \
1744 do \
1745 { \
1746 umint_t x0_ = LazyReduce2(xx0, q); \
1747 umint_t x1_ = xx1; \
1748 umint_t t_ = LazyMulModPrecon(x1_, w, q, wqinv); \
1749 xx0 = LazyAddMod(x0_, t_, q); \
1750 xx1 = LazySubMod(x0_, t_, q); \
1751 } while (0)
1752
1753 #define inv_butterfly1_neg(xx0, xx1, w, q, wqinv, w1, w1qinv) \
1754 do \
1755 { \
1756 umint_t x0_ = LazyReduce2(xx0, q); \
1757 umint_t x1_ = xx1; \
1758 umint_t t_ = LazyMulModPrecon(LazyMulModPrecon(x1_, w1, q, w1qinv), w, q, wqinv); \
1759 xx0 = LazySubMod(x0_, t_, q); /* NEG */ \
1760 xx1 = LazyAddMod(x0_, t_, q); /* NEG */ \
1761 } while (0)
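// NOTE: ignoring the lazy (partially reduced) representation, these macros
// compute, modulo q:
//
//    inv_butterfly:      (x0, x1) -> (x0 + w*x1, x0 - w*x1)
//    inv_butterfly_neg:  (x0, x1) -> (x0 - w*x1, x0 + w*x1)
//    inv_butterfly1_neg: as inv_butterfly_neg, but with the twiddle w replaced
//                        by w*w1 (two chained LazyMulModPrecon's)
//
// The "_neg" variants are used where the table entry addressed is
// w^{half-j} = -w^{-j} (since w^{half} = -1), so swapping the add and the
// subtract gives the same result as multiplying by w^{-j} directly.
// Small worked example in plain modular arithmetic, q = 5, w = 2:
// inv_butterfly maps (3, 4) to (3 + 2*4, 3 - 2*4) = (11, -5) = (1, 0) mod 5.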
1762
1763
1764 static
1765 void new_ifft_short2(umint_t* yp, long yn, long lgN, const new_mod_t& mod);
1766
1767
1768
1769 // requires size divisible by 8
1770 static void
1771 new_ifft_layer(umint_t* xp, long blocks, long size,
1772 const mint_t* wtab,
1773 const mulmod_precon_t* wqinvtab, mint_t q)
1774 {
1775
1776 size /= 2;
1777 const mint_t* NTL_RESTRICT wtab1 = wtab + size;
1778 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + size;
1779
1780 do
1781 {
1782
1783 umint_t* NTL_RESTRICT xp0 = xp;
1784 umint_t* NTL_RESTRICT xp1 = xp + size;
1785
1786
1787 // first 4 butterflies
1788 inv_butterfly0(xp0[0], xp1[0], q);
1789 inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
1790 inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
1791 inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
1792
1793 // 4-way unroll
1794 for (long j = 4; j < size; j+= 4) {
1795 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1796 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1797 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1798 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1799 }
1800
1801 xp += 2 * size;
1802 }
1803 while (--blocks != 0);
1804 }
1805
1806
1807 static void
1808 new_ifft_first_two_layers(umint_t* xp, long blocks, const mint_t* wtab,
1809 const mulmod_precon_t* wqinvtab, mint_t q)
1810 {
1811 // 4th root of unity
1812 mint_t w = wtab[1];
1813 mulmod_precon_t wqinv = wqinvtab[1];
1814
1815 do
1816 {
1817 umint_t u0 = LazyReduce2(xp[0], q);
1818 umint_t u1 = LazyReduce2(xp[1], q);
1819 umint_t u2 = LazyReduce2(xp[2], q);
1820 umint_t u3 = LazyReduce2(xp[3], q);
1821
1822 umint_t v0 = LazyAddMod2(u0, u1, q);
1823 umint_t v1 = LazySubMod2(u0, u1, q);
1824 umint_t v2 = LazyAddMod2(u2, u3, q);
1825 umint_t t = LazySubMod(u2, u3, q);
1826 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1827
1828 xp[0] = LazyAddMod(v0, v2, q);
1829 xp[2] = LazySubMod(v0, v2, q);
1830 xp[1] = LazySubMod(v1, v3, q); // NEG
1831 xp[3] = LazyAddMod(v1, v3, q); // NEG
1832
1833 xp += 4;
1834 }
1835 while (--blocks != 0);
1836 }
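// NOTE: the loop above fuses the first two inverse layers into one pass over
// each block of 4 entries.  Writing u0..u3 for the inputs and w for the 4th
// root of unity (so w^2 = -1 mod q), each block computes
//
//    xp[0] = (u0 + u1) + (u2 + u3)
//    xp[2] = (u0 + u1) - (u2 + u3)
//    xp[1] = (u0 - u1) - w*(u2 - u3)
//    xp[3] = (u0 - u1) + w*(u2 - u3)
//
// i.e., two levels of inverse butterflies; the sign pattern on xp[1]/xp[3]
// is the "NEG" convention marked in the code.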
1837
1838
1839
1840 static
1841 void new_ifft_base(umint_t* xp, long lgN, const new_mod_t& mod)
1842 {
1843 if (lgN == 0) return;
1844
1845 mint_t q = mod.q;
1846
1847 if (lgN == 1)
1848 {
1849 umint_t x0 = LazyReduce2(xp[0], q);
1850 umint_t x1 = LazyReduce2(xp[1], q);
1851 xp[0] = LazyAddMod(x0, x1, q);
1852 xp[1] = LazySubMod(x0, x1, q);
1853 return;
1854 }
1855
1856 const mint_t** wtab = mod.wtab;
1857 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1858
1859 long blocks = 1L << (lgN - 2);
1860 new_ifft_first_two_layers(xp, blocks, wtab[2], wqinvtab[2], q);
1861 blocks >>= 1;
1862
1863 long size = 8;
1864 for (long j = 3; j <= lgN; j++, blocks >>= 1, size <<= 1)
1865 new_ifft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
1866 }
1867
1868
1869 static
1870 void new_ifft_short1(umint_t* xp, long yn, long lgN, const new_mod_t& mod)
1871
1872 // Implements truncated inverse FFT interface, but with xn==yn.
1873 // All computations are done in place.
1874
1875 {
1876 long N = 1L << lgN;
1877
1878 if (yn == N && lgN <= NTL_NEW_FFT_THRESH)
1879 {
1880 // no truncation
1881 new_ifft_base(xp, lgN, mod);
1882 return;
1883 }
1884
1885 // divide-and-conquer algorithm
1886
1887 long half = N >> 1;
1888 mint_t q = mod.q;
1889
1890 if (yn <= half)
1891 {
1892 // X -> 2X
1893 for (long j = 0; j < yn; j++)
1894 xp[j] = LazyDoubleMod4(xp[j], q);
1895
1896 new_ifft_short1(xp, yn, lgN - 1, mod);
1897 }
1898 else
1899 {
1900 umint_t* NTL_RESTRICT xp0 = xp;
1901 umint_t* NTL_RESTRICT xp1 = xp + half;
1902 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
1903 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
1904
1905 new_ifft_short1(xp0, half, lgN - 1, mod);
1906
1907 yn -= half;
1908
1909 // X -> (2X, w*X)
1910 for (long j = yn; j < half; j++)
1911 {
1912 umint_t x0 = xp0[j];
1913 xp0[j] = LazyDoubleMod4(x0, q);
1914 xp1[j] = LazyMulModPrecon(x0, wtab[j], q, wqinvtab[j]);
1915 }
1916
1917 new_ifft_short2(xp1, yn, lgN - 1, mod);
1918
1919 // (X, Y) -> (X + Y/w, X - Y/w)
1920 {
1921 const mint_t* NTL_RESTRICT wtab1 = wtab + half;
1922 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + half;
1923
1924 // DIRT: assumes yn is a multiple of 4
1925 inv_butterfly0(xp0[0], xp1[0], q);
1926 inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
1927 inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
1928 inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
1929 for (long j = 4; j < yn; j+=4) {
1930 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1931 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1932 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1933 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1934 }
1935 }
1936 }
1937 }
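// NOTE: a rough sketch of the truncated inverse FFT recursion implemented
// above (in the style of van der Hoeven's TFT), with n = 2^lgN, half = n/2:
//
//    if yn <= half:
//       double the first yn entries and recurse on the size-half subproblem
//       (the upper half of the array is never touched);
//    else:
//       fully invert the lower half (recursive call with yn = half);
//       for positions j in [yn-half, half), where the upper-half inputs were
//       truncated away, set xp0[j] = 2*X and xp1[j] = w^j * X;
//       invert the remaining upper part with new_ifft_short2;
//       combine the first yn-half positions with
//       (X, Y) -> (X + Y/w^j, X - Y/w^j).
//
// Loosely speaking, this undoes the truncated forward transform with xn == yn,
// up to the factor 2^lgN that the high-level routines divide out afterwards.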
1938
1939
1940 static
1941 void new_ifft_short1_notab(umint_t* xp, long yn, long lgN, const new_mod_t& mod,
1942 mint_t w, mulmod_precon_t wqinv,
1943 mint_t iw, mulmod_precon_t iwqinv)
1944 // This version assumes that we only have tables up to level lgN-1,
1945 // and w generates the values at level lgN.
1946 // DIRT: requires yn even
1947 {
1948 long N = 1L << lgN;
1949
1950 // divide-and-conquer algorithm
1951
1952 long half = N >> 1;
1953 mint_t q = mod.q;
1954
1955 if (yn <= half)
1956 {
1957 // X -> 2X
1958 for (long j = 0; j < yn; j++)
1959 xp[j] = LazyDoubleMod4(xp[j], q);
1960
1961 new_ifft_short1(xp, yn, lgN - 1, mod);
1962 }
1963 else
1964 {
1965 umint_t* NTL_RESTRICT xp0 = xp;
1966 umint_t* NTL_RESTRICT xp1 = xp + half;
1967 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN-1];
1968 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN-1];
1969
1970 new_ifft_short1(xp0, half, lgN - 1, mod);
1971
1972 yn -= half;
1973
1974 // X -> (2X, w*X)
1975 for (long j = yn, j_half = yn/2; j < half; j+=2, j_half++) {
1976 {
1977 umint_t x0 = xp0[j+0];
1978 xp0[j+0] = LazyDoubleMod4(x0, q);
1979 xp1[j+0] = LazyMulModPrecon(x0, wtab[j_half], q, wqinvtab[j_half]);
1980 }
1981 {
1982 umint_t x0 = xp0[j+1];
1983 xp0[j+1] = LazyDoubleMod4(x0, q);
1984 xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(x0, w, q, wqinv),
1985 wtab[j_half], q, wqinvtab[j_half]);
1986 }
1987 }
1988
1989 new_ifft_short2(xp1, yn, lgN - 1, mod);
1990
1991 // (X, Y) -> (X + Y/w, X - Y/w)
1992 {
1993 const mint_t* NTL_RESTRICT wtab1 = wtab + half/2;
1994 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + half/2;
1995
1996 inv_butterfly0(xp0[0], xp1[0], q);
1997 inv_butterfly(xp0[1], xp1[1], iw, q, iwqinv);
1998 for (long j = 2, j_half = 1; j < yn; j+=2, j_half++) {
1999 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-j_half], q, wqinvtab1[-j_half]);
2000 inv_butterfly1_neg(xp0[j+1], xp1[j+1], wtab1[-j_half], q, wqinvtab1[-j_half], iw, iwqinv);
2001 }
2002 }
2003 }
2004 }
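// NOTE: this "notab" variant only has twiddle tables up to level lgN-1, so the
// level-lgN twiddles w^j are synthesized on the fly: for even j it uses the
// level-(lgN-1) table entry (w^2)^(j/2) directly (index j_half), and for odd j
// it multiplies that entry by one extra factor of w (or of iw = w^{-1} in the
// final combining step).  This is also why yn is required to be even.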
2005
2006
2007
2008 //=========
2009
2010
2011 // requires size divisible by 8
2012 static void
2013 new_ifft_layer_flipped(umint_t* xp, long blocks, long size,
2014 const mint_t* NTL_RESTRICT wtab,
2015 const mulmod_precon_t* NTL_RESTRICT wqinvtab, mint_t q)
2016 {
2017
2018 size /= 2;
2019
2020 do
2021 {
2022
2023 umint_t* NTL_RESTRICT xp0 = xp;
2024 umint_t* NTL_RESTRICT xp1 = xp + size;
2025
2026
2027 // first 4 butterflies
2028 inv_butterfly0(xp0[0], xp1[0], q);
2029 inv_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
2030 inv_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
2031 inv_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
2032
2033 // 4-way unroll
2034 for (long j = 4; j < size; j+= 4) {
2035 inv_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
2036 inv_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
2037 inv_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
2038 inv_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
2039 }
2040
2041 xp += 2 * size;
2042 }
2043 while (--blocks != 0);
2044 }
2045
2046
2047 static void
2048 new_ifft_first_two_layers_flipped(umint_t* xp, long blocks, const mint_t* wtab,
2049 const mulmod_precon_t* wqinvtab, mint_t q)
2050 {
2051 // 4th root of unity
2052 mint_t w = wtab[1];
2053 mulmod_precon_t wqinv = wqinvtab[1];
2054
2055 do
2056 {
2057 umint_t u0 = LazyReduce2(xp[0], q);
2058 umint_t u1 = LazyReduce2(xp[1], q);
2059 umint_t u2 = LazyReduce2(xp[2], q);
2060 umint_t u3 = LazyReduce2(xp[3], q);
2061
2062 umint_t v0 = LazyAddMod2(u0, u1, q);
2063 umint_t v1 = LazySubMod2(u0, u1, q);
2064 umint_t v2 = LazyAddMod2(u2, u3, q);
2065 umint_t t = LazySubMod(u2, u3, q);
2066 umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
2067
2068 xp[0] = LazyAddMod(v0, v2, q);
2069 xp[2] = LazySubMod(v0, v2, q);
2070 xp[1] = LazyAddMod(v1, v3, q);
2071 xp[3] = LazySubMod(v1, v3, q);
2072
2073 xp += 4;
2074 }
2075 while (--blocks != 0);
2076 }
2077
2078
2079
2080 static
2081 void new_ifft_base_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
2082 {
2083 if (lgN == 0) return;
2084
2085 mint_t q = mod.q;
2086
2087 if (lgN == 1)
2088 {
2089 umint_t x0 = LazyReduce2(xp[0], q);
2090 umint_t x1 = LazyReduce2(xp[1], q);
2091 xp[0] = LazyAddMod(x0, x1, q);
2092 xp[1] = LazySubMod(x0, x1, q);
2093 return;
2094 }
2095
2096 const mint_t** wtab = mod.wtab;
2097 const mulmod_precon_t** wqinvtab = mod.wqinvtab;
2098
2099 long blocks = 1L << (lgN - 2);
2100 new_ifft_first_two_layers_flipped(xp, blocks, wtab[2], wqinvtab[2], q);
2101 blocks >>= 1;
2102
2103 long size = 8;
2104 for (long j = 3; j <= lgN; j++, blocks >>= 1, size <<= 1)
2105 new_ifft_layer_flipped(xp, blocks, size, wtab[j], wqinvtab[j], q);
2106 }
2107
2108
2109 static
2110 void new_ifft_short1_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
2111 {
2112 long N = 1L << lgN;
2113
2114 if (lgN <= NTL_NEW_FFT_THRESH)
2115 {
2116 new_ifft_base_flipped(xp, lgN, mod);
2117 return;
2118 }
2119
2120 // divide-and-conquer algorithm
2121
2122 long half = N >> 1;
2123 mint_t q = mod.q;
2124
2125 umint_t* NTL_RESTRICT xp0 = xp;
2126 umint_t* NTL_RESTRICT xp1 = xp + half;
2127 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
2128 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
2129
2130 new_ifft_short1_flipped(xp0, lgN - 1, mod);
2131 new_ifft_short1_flipped(xp1, lgN - 1, mod);
2132
2133 // (X, Y) -> (X + Y*w, X - Y*w)
2134
2135 inv_butterfly0(xp0[0], xp1[0], q);
2136 inv_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
2137 inv_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
2138 inv_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
2139 for (long j = 4; j < half; j+=4) {
2140 inv_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
2141 inv_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
2142 inv_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
2143 inv_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
2144 }
2145 }
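// NOTE: structurally, new_ifft_short1_flipped mirrors new_fft_short_flipped
// above: it recurses on the two halves first and then combines them with
// butterflies (X, Y) -> (X + w^j*Y, X - w^j*Y), indexing the twiddle table in
// increasing order rather than through the negated indices used on the forward
// side.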
2146
2147 //=========
2148
2149
2150
2151 static
2152 void new_ifft_short2(umint_t* xp, long yn, long lgN, const new_mod_t& mod)
2153
2154 // Implements truncated inverse FFT interface, but with xn==N.
2155 // All computations are done in place.
2156
2157 {
2158 long N = 1L << lgN;
2159
2160 if (yn == N && lgN <= NTL_NEW_FFT_THRESH)
2161 {
2162 // no truncation
2163 new_ifft_base(xp, lgN, mod);
2164 return;
2165 }
2166
2167 // divide-and-conquer algorithm
2168
2169 long half = N >> 1;
2170 mint_t q = mod.q;
2171
2172 if (yn <= half)
2173 {
2174 // X -> 2X
2175 for (long j = 0; j < yn; j++)
2176 xp[j] = LazyDoubleMod4(xp[j], q);
2177 // (X, Y) -> X + Y
2178 for (long j = yn; j < half; j++)
2179 xp[j] = LazyAddMod4(xp[j], xp[j + half], q);
2180
2181 new_ifft_short2(xp, yn, lgN - 1, mod);
2182
2183 // (X, Y) -> X - Y
2184 for (long j = 0; j < yn; j++)
2185 xp[j] = LazySubMod4(xp[j], xp[j + half], q);
2186 }
2187 else
2188 {
2189 umint_t* NTL_RESTRICT xp0 = xp;
2190 umint_t* NTL_RESTRICT xp1 = xp + half;
2191 const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
2192 const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
2193
2194 new_ifft_short1(xp0, half, lgN - 1, mod);
2195
2196 yn -= half;
2197
2198
2199 // (X, Y) -> (2X - Y, w*(X - Y))
2200 for (long j = yn; j < half; j++)
2201 {
2202 umint_t x0 = xp0[j];
2203 umint_t x1 = xp1[j];
2204 umint_t u = LazySubMod4(x0, x1, q);
2205 xp0[j] = LazyAddMod4(x0, u, q);
2206 xp1[j] = LazyMulModPrecon(u, wtab[j], q, wqinvtab[j]);
2207 }
2208
2209 new_ifft_short2(xp1, yn, lgN - 1, mod);
2210
2211 // (X, Y) -> (X + Y/w, X - Y/w)
2212 {
2213 const mint_t* NTL_RESTRICT wtab1 = wtab + half;
2214 const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + half;
2215
2216 // DIRT: assumes yn is a multiple of 4
2217 inv_butterfly0(xp0[0], xp1[0], q);
2218 inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
2219 inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
2220 inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
2221 for (long j = 4; j < yn; j+=4) {
2222 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
2223 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
2224 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
2225 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
2226 }
2227 }
2228 }
2229 }
2230
2231
2232 //=============================================
2233
2234 // HIGH LEVEL ROUTINES
2235
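// NOTE: an illustrative usage sketch (not part of the library) for the
// high-level truncated FFT/IFFT routines defined below, multiplying two
// polynomials a and b of degrees da and db with da + db + 1 <= 2^k.  It
// assumes an FFTPrimeInfo `info` initialized for a suitable FFT prime
// (for example via InitFFTPrimeInfo near the end of this file), and buffers
// A, B of length 2^k holding the coefficients, zero-padded beyond the stated
// xn:
//
//    long yn = da + db + 1;                  // outputs actually needed
//    new_fft(A, A, k, info, yn, da + 1);     // truncated forward transforms
//    new_fft(B, B, k, info, yn, db + 1);
//    for (long i = 0; i < yn; i++)
//       A[i] = MulMod(A[i], B[i], info.q, info.qinv);  // pointwise product
//    new_ifft(A, A, k, info, yn);            // truncated inverse (also
//                                            // multiplies by 2^{-k})
//
// afterwards A[0..yn-1] holds the coefficients of a*b modulo info.q.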
2236 //=========== FFT without tables ===========
2237
2238
2239 NTL_TLS_GLOBAL_DECL(Vec<umint_t>, AA_store)
2240
2241 NTL_TLS_GLOBAL_DECL(Vec<FFTVectorPair>, mul_vec)
2242
2243 void new_fft_notab(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2244 long yn, long xn)
2245
2246 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2247 // xn and yn are as described above in the truncated FFT interface.
2248 // Both A and a should point to arrays of size 2^k,
2249 // and should either be the same or not overlap at all.
2250 // This version does not use precomputed tables.
2251
2252 {
2253 mint_t q = info.q;
2254
2255 if (k <= 1) {
2256 if (k == 0) {
2257 A[0] = a[0];
2258 return;
2259 }
2260 if (k == 1) {
2261 mint_t A0 = AddMod(a[0], a[1], q);
2262 mint_t A1 = SubMod(a[0], a[1], q);
2263 A[0] = A0;
2264 A[1] = A1;
2265 return;
2266 }
2267 }
2268
2269 // assume k > 1
2270 const mint_t *root = info.RootTable[0].elts();
2271 mulmod_t qinv = info.qinv;
2272
2273 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2274 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2275
2276 long n = 1L << k;
2277
2278 const mint_t *wtab[NTL_FFTMaxRoot+1];
2279 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2280
2281 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2282 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2283
2284 new_mod_t mod;
2285 mod.q = q;
2286 mod.wtab = &wtab[0];
2287 mod.wqinvtab = &wqinvtab[0];
2288
2289 mint_t w = info.RootTable[0][k];
2290 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2291
2292 #ifdef NTL_FFT_USEBUF
2293 NTL_TLS_GLOBAL_ACCESS(AA_store);
2294 AA_store.SetLength(1L << k);
2295 umint_t *AA = AA_store.elts();
2296
2297 for (long i = 0; i < xn; i++) AA[i] = a[i];
2298
2299 new_fft_short_notab(AA, yn, xn, k, mod, w, wqinv);
2300
2301 for (long i = 0; i < yn; i++) {
2302 A[i] = LazyReduce1(AA[i], q);
2303 }
2304 #else
2305 umint_t *AA = (umint_t *) A;
2306 if (a != A) for (long i = 0; i < xn; i++) AA[i] = a[i];
2307
2308 new_fft_short_notab(AA, yn, xn, k, mod, w, wqinv);
2309
2310 for (long i = 0; i < yn; i++) {
2311 AA[i] = LazyReduce1(AA[i], q);
2312 }
2313 #endif
2314 }
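// NOTE: the "notab" strategy above builds per-level twiddle tables only up to
// level k-1 (ComputeMultipliers, cached in the thread-local mul_vec); the
// single level-k root w = RootTable[0][k] and its precomputed quotient are
// passed separately to new_fft_short_notab, which (like new_ifft_short1_notab
// above) synthesizes the missing level-k twiddles on the fly.  This keeps the
// per-call table cost bounded when no persistent bigtab is available.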
2315
2316
2317 void new_fft_flipped_notab(mint_t* A, const mint_t* a, long k,
2318 const FFTPrimeInfo& info)
2319
2320 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2321 // Both A and a should point to arrays of size 2^k,
2322 // and should either be the same or not overlap at all.
2323 // This version is "flipped" -- it uses inverted roots,
2324 // multiplies by 2^{-k}, and performs no truncations.
2325 // This version does not use precomputed tables.
2326
2327 {
2328 mint_t q = info.q;
2329
2330 if (k <= 1) {
2331 if (k == 0) {
2332 A[0] = a[0];
2333 return;
2334 }
2335 if (k == 1) {
2336 mint_t two_inv = info.TwoInvTable[1];
2337 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2338 mint_t A0 = AddMod(a[0], a[1], q);
2339 mint_t A1 = SubMod(a[0], a[1], q);
2340 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2341 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2342 return;
2343 }
2344 }
2345
2346 // assume k > 1
2347 const mint_t *root = info.RootTable[1].elts();
2348 mulmod_t qinv = info.qinv;
2349
2350 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2351 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2352
2353 long n = 1L << k;
2354
2355 const mint_t *wtab[NTL_FFTMaxRoot+1];
2356 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2357
2358 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2359 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2360
2361 new_mod_t mod;
2362 mod.q = q;
2363 mod.wtab = &wtab[0];
2364 mod.wqinvtab = &wqinvtab[0];
2365
2366 mint_t w = info.RootTable[1][k];
2367 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2368
2369 mint_t two_inv = info.TwoInvTable[k];
2370 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2371
2372 #ifdef NTL_FFT_USEBUF
2373 NTL_TLS_GLOBAL_ACCESS(AA_store);
2374 AA_store.SetLength(1L << k);
2375 umint_t *AA = AA_store.elts();
2376
2377 for (long i = 0; i < n; i++) AA[i] = a[i];
2378
2379 new_fft_short_notab(AA, n, n, k, mod, w, wqinv);
2380
2381 for (long i = 0; i < n; i++) {
2382 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2383 A[i] = LazyReduce1(tmp, q);
2384 }
2385 #else
2386 umint_t *AA = (umint_t *) A;
2387 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2388
2389 new_fft_short_notab(AA, n, n, k, mod, w, wqinv);
2390
2391 for (long i = 0; i < n; i++) {
2392 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2393 AA[i] = LazyReduce1(tmp, q);
2394 }
2395
2396 #endif
2397 }
2398
2399
2400 //=========== Inverse FFT without tables ===========
2401
2402 void new_ifft_notab(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2403 long yn)
2404
2405 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2406 // yn (which equals xn here) is as described above in the truncated FFT interface.
2407 // Both A and a should point to arrays of size 2^k,
2408 // and should either be the same or not overlap at all.
2409 // Multiplies by 2^{-k}.
2410 // This version does not use precomputed tables.
2411
2412 {
2413 mint_t q = info.q;
2414
2415 if (k <= 1) {
2416 if (k == 0) {
2417 A[0] = a[0];
2418 return;
2419 }
2420 if (k == 1) {
2421 mint_t two_inv = info.TwoInvTable[1];
2422 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2423 mint_t A0 = AddMod(a[0], a[1], q);
2424 mint_t A1 = SubMod(a[0], a[1], q);
2425 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2426 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2427 return;
2428 }
2429 }
2430
2431 // assume k > 1
2432 const mint_t *root = info.RootTable[0].elts();
2433 mulmod_t qinv = info.qinv;
2434
2435 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2436 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2437
2438 long n = 1L << k;
2439
2440 const mint_t *wtab[NTL_FFTMaxRoot+1];
2441 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2442
2443 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2444 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2445
2446 new_mod_t mod;
2447 mod.q = q;
2448 mod.wtab = &wtab[0];
2449 mod.wqinvtab = &wqinvtab[0];
2450
2451
2452 mint_t w = info.RootTable[0][k];
2453 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2454
2455 mint_t iw = info.RootTable[1][k];
2456 mulmod_precon_t iwqinv = LazyPrepMulModPrecon(iw, q, info.qinv);
2457
2458 mint_t two_inv = info.TwoInvTable[k];
2459 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2460
2461 #ifdef NTL_FFT_USEBUF
2462 NTL_TLS_GLOBAL_ACCESS(AA_store);
2463 AA_store.SetLength(1L << k);
2464 umint_t *AA = AA_store.elts();
2465
2466 for (long i = 0; i < yn; i++) AA[i] = a[i];
2467
2468 new_ifft_short1_notab(AA, yn, k, mod, w, wqinv, iw, iwqinv);
2469
2470 for (long i = 0; i < yn; i++) {
2471 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2472 A[i] = LazyReduce1(tmp, q);
2473 }
2474 #else
2475 umint_t *AA = (umint_t *) A;
2476 if (a != A) for (long i = 0; i < yn; i++) AA[i] = a[i];
2477
2478 new_ifft_short1_notab(AA, yn, k, mod, w, wqinv, iw, iwqinv);
2479
2480 for (long i = 0; i < yn; i++) {
2481 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2482 AA[i] = LazyReduce1(tmp, q);
2483 }
2484
2485 #endif
2486 }
2487
2488
2489 void new_ifft_flipped_notab(mint_t* A, const mint_t* a, long k,
2490 const FFTPrimeInfo& info)
2491
2492 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2493 // Flipped means that inverse roots are used, and that there is no truncation and
2494 // no multiplication by 2^{-k}.
2495 // Both A and a should point to arrays of size 2^k,
2496 // and should either be the same or not overlap at all.
2497 // This version does not use precomputed tables.
2498
2499 {
2500 mint_t q = info.q;
2501
2502 if (k <= 1) {
2503 if (k == 0) {
2504 A[0] = a[0];
2505 return;
2506 }
2507 if (k == 1) {
2508 mint_t A0 = AddMod(a[0], a[1], q);
2509 mint_t A1 = SubMod(a[0], a[1], q);
2510 A[0] = A0;
2511 A[1] = A1;
2512 return;
2513 }
2514 }
2515
2516 // assume k > 1
2517 const mint_t *root = info.RootTable[1].elts();
2518 mulmod_t qinv = info.qinv;
2519
2520 NTL_TLS_GLOBAL_ACCESS(mul_vec);
2521 ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2522
2523 long n = 1L << k;
2524
2525 const mint_t *wtab[NTL_FFTMaxRoot+1];
2526 for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2527
2528 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2529 for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2530
2531 new_mod_t mod;
2532 mod.q = q;
2533 mod.wtab = &wtab[0];
2534 mod.wqinvtab = &wqinvtab[0];
2535
2536 mint_t w = info.RootTable[1][k];
2537 mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2538
2539 mint_t iw = info.RootTable[0][k];
2540 mulmod_precon_t iwqinv = LazyPrepMulModPrecon(iw, q, info.qinv);
2541
2542 #ifdef NTL_FFT_USEBUF
2543 NTL_TLS_GLOBAL_ACCESS(AA_store);
2544 AA_store.SetLength(1L << k);
2545 umint_t *AA = AA_store.elts();
2546
2547 for (long i = 0; i < n; i++) AA[i] = a[i];
2548
2549
2550 new_ifft_short1_notab(AA, n, k, mod, w, wqinv, iw, iwqinv);
2551
2552 for (long i = 0; i < n; i++) {
2553 umint_t tmp = LazyReduce2(AA[i], q);
2554 A[i] = LazyReduce1(tmp, q);
2555 }
2556 #else
2557 umint_t *AA = (umint_t *) A;
2558 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2559
2560 new_ifft_short1_notab(AA, n, k, mod, w, wqinv, iw, iwqinv);
2561
2562 for (long i = 0; i < n; i++) {
2563 umint_t tmp = LazyReduce2(AA[i], q);
2564 AA[i] = LazyReduce1(tmp, q);
2565 }
2566 #endif
2567 }
2568
2569
2570 #ifndef NTL_ENABLE_AVX_FFT
2571
2572 //================ FFT with tables ==============
2573
2574
2575 void new_fft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2576 long yn, long xn)
2577
2578 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2579 // xn and yn are as described above in the truncated FFT interface.
2580 // Both A and a should point to arrays of size 2^k,
2581 // and should either be the same or not overlap at all.
2582
2583 {
2584 if (!info.bigtab || k > info.bigtab->bound) {
2585 new_fft_notab(A, a, k, info, yn, xn);
2586 return;
2587 }
2588
2589 mint_t q = info.q;
2590
2591 if (k <= 1) {
2592 if (k == 0) {
2593 A[0] = a[0];
2594 return;
2595 }
2596 if (k == 1) {
2597 mint_t A0 = AddMod(a[0], a[1], q);
2598 mint_t A1 = SubMod(a[0], a[1], q);
2599 A[0] = A0;
2600 A[1] = A1;
2601 return;
2602 }
2603 }
2604
2605 // assume k > 1
2606 const mint_t *root = info.RootTable[0].elts();
2607 mulmod_t qinv = info.qinv;
2608 const FFTMultipliers& tab = info.bigtab->MulTab;
2609
2610 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2611
2612
2613 long n = 1L << k;
2614
2615
2616 const mint_t *wtab[NTL_FFTMaxRoot+1];
2617 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2618
2619 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2620 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2621
2622 new_mod_t mod;
2623 mod.q = q;
2624 mod.wtab = &wtab[0];
2625 mod.wqinvtab = &wqinvtab[0];
2626
2627
2628
2629 #ifdef NTL_FFT_USEBUF
2630 NTL_TLS_GLOBAL_ACCESS(AA_store);
2631 AA_store.SetLength(1L << k);
2632 umint_t *AA = AA_store.elts();
2633
2634 for (long i = 0; i < xn; i++) AA[i] = a[i];
2635
2636 new_fft_short(AA, yn, xn, k, mod);
2637
2638 for (long i = 0; i < yn; i++) {
2639 A[i] = LazyReduce1(AA[i], q);
2640 }
2641 #else
2642 umint_t *AA = (umint_t *) A;
2643 if (a != A) for (long i = 0; i < xn; i++) AA[i] = a[i];
2644
2645 new_fft_short(AA, yn, xn, k, mod);
2646
2647 for (long i = 0; i < yn; i++) {
2648 AA[i] = LazyReduce1(AA[i], q);
2649 }
2650 #endif
2651
2652 }
2653
2654 void new_fft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
2655
2656 // Performs a high-level FFT. Inputs and outputs are in the range [0,q).
2657 // Both A and a should point to arrays of size 2^k,
2658 // and should either be the same or not overlap at all.
2659 // This version is "flipped" -- it uses inverted roots,
2660 // multiplies by 2^{-k}, and performs no truncations.
2661
2662 {
2663 if (!info.bigtab || k > info.bigtab->bound) {
2664 new_fft_flipped_notab(A, a, k, info);
2665 return;
2666 }
2667
2668 mint_t q = info.q;
2669
2670 if (k <= 1) {
2671 if (k == 0) {
2672 A[0] = a[0];
2673 return;
2674 }
2675 if (k == 1) {
2676 mint_t two_inv = info.TwoInvTable[1];
2677 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2678 mint_t A0 = AddMod(a[0], a[1], q);
2679 mint_t A1 = SubMod(a[0], a[1], q);
2680 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2681 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2682 return;
2683 }
2684 }
2685
2686 // assume k > 1
2687 const mint_t *root = info.RootTable[0].elts();
2688 mulmod_t qinv = info.qinv;
2689 const FFTMultipliers& tab = info.bigtab->MulTab;
2690
2691 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2692
2693
2694 long n = 1L << k;
2695
2696
2697 const mint_t *wtab[NTL_FFTMaxRoot+1];
2698 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2699
2700 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2701 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2702
2703 new_mod_t mod;
2704 mod.q = q;
2705 mod.wtab = &wtab[0];
2706 mod.wqinvtab = &wqinvtab[0];
2707
2708 mint_t two_inv = info.TwoInvTable[k];
2709 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2710
2711
2712 #ifdef NTL_FFT_USEBUF
2713 NTL_TLS_GLOBAL_ACCESS(AA_store);
2714 AA_store.SetLength(1L << k);
2715 umint_t *AA = AA_store.elts();
2716
2717 for (long i = 0; i < n; i++) AA[i] = a[i];
2718
2719 new_fft_short_flipped(AA, k, mod);
2720
2721 for (long i = 0; i < n; i++) {
2722 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2723 A[i] = LazyReduce1(tmp, q);
2724 }
2725 #else
2726 umint_t *AA = (umint_t *) A;
2727 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2728
2729 new_fft_short_flipped(AA, k, mod);
2730
2731 for (long i = 0; i < n; i++) {
2732 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2733 AA[i] = LazyReduce1(tmp, q);
2734 }
2735 #endif
2736 }
2737
2738 //======= Inverse FFT with tables ==============
2739
2740
2741 void new_ifft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2742 long yn)
2743
2744 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2745 // yn (which equals xn here) is as described above in the truncated FFT interface.
2746 // Both A and a should point to arrays of size 2^k,
2747 // and should either be the same or not overlap at all.
2748 // Multiplies by 2^{-k}.
2749
2750 {
2751 if (!info.bigtab || k > info.bigtab->bound) {
2752 new_ifft_notab(A, a, k, info, yn);
2753 return;
2754 }
2755
2756 mint_t q = info.q;
2757
2758 if (k <= 1) {
2759 if (k == 0) {
2760 A[0] = a[0];
2761 return;
2762 }
2763 if (k == 1) {
2764 mint_t two_inv = info.TwoInvTable[1];
2765 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2766 mint_t A0 = AddMod(a[0], a[1], q);
2767 mint_t A1 = SubMod(a[0], a[1], q);
2768 A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2769 A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2770 return;
2771 }
2772 }
2773
2774 // assume k > 1
2775 const mint_t *root = info.RootTable[0].elts();
2776 mulmod_t qinv = info.qinv;
2777 const FFTMultipliers& tab = info.bigtab->MulTab;
2778
2779 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2780
2781
2782 long n = 1L << k;
2783
2784
2785 const mint_t *wtab[NTL_FFTMaxRoot+1];
2786 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2787
2788 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2789 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2790
2791 new_mod_t mod;
2792 mod.q = q;
2793 mod.wtab = &wtab[0];
2794 mod.wqinvtab = &wqinvtab[0];
2795
2796 mint_t two_inv = info.TwoInvTable[k];
2797 mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2798
2799 #ifdef NTL_FFT_USEBUF
2800 NTL_TLS_GLOBAL_ACCESS(AA_store);
2801 AA_store.SetLength(1L << k);
2802 umint_t *AA = AA_store.elts();
2803
2804 for (long i = 0; i < yn; i++) AA[i] = a[i];
2805
2806 new_ifft_short1(AA, yn, k, mod);
2807
2808 for (long i = 0; i < yn; i++) {
2809 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2810 A[i] = LazyReduce1(tmp, q);
2811 }
2812 #else
2813 umint_t *AA = (umint_t *) A;
2814 if (a != A) for (long i = 0; i < yn; i++) AA[i] = a[i];
2815
2816 new_ifft_short1(AA, yn, k, mod);
2817
2818 for (long i = 0; i < yn; i++) {
2819 umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2820 AA[i] = LazyReduce1(tmp, q);
2821 }
2822 #endif
2823 }
2824
2825
2826 void new_ifft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
2827
2828
2829 // Performs a high-level IFFT. Inputs and outputs are in the range [0,q).
2830 // Flipped means that inverse roots are used, and that there is no truncation and
2831 // no multiplication by 2^{-k}.
2832 // Both A and a should point to arrays of size 2^k,
2833 // and should either be the same or not overlap at all.
2834
2835
2836 {
2837 if (!info.bigtab || k > info.bigtab->bound) {
2838 new_ifft_flipped_notab(A, a, k, info);
2839 return;
2840 }
2841
2842 mint_t q = info.q;
2843
2844 if (k <= 1) {
2845 if (k == 0) {
2846 A[0] = a[0];
2847 return;
2848 }
2849 if (k == 1) {
2850 mint_t A0 = AddMod(a[0], a[1], q);
2851 mint_t A1 = SubMod(a[0], a[1], q);
2852 A[0] = A0;
2853 A[1] = A1;
2854 return;
2855 }
2856 }
2857
2858 // assume k > 1
2859 const mint_t *root = info.RootTable[0].elts();
2860 mulmod_t qinv = info.qinv;
2861 const FFTMultipliers& tab = info.bigtab->MulTab;
2862
2863 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2864
2865
2866 long n = 1L << k;
2867
2868
2869 const mint_t *wtab[NTL_FFTMaxRoot+1];
2870 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2871
2872 const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2873 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2874
2875 new_mod_t mod;
2876 mod.q = q;
2877 mod.wtab = &wtab[0];
2878 mod.wqinvtab = &wqinvtab[0];
2879
2880
2881 #ifdef NTL_FFT_USEBUF
2882 NTL_TLS_GLOBAL_ACCESS(AA_store);
2883 AA_store.SetLength(1L << k);
2884 umint_t *AA = AA_store.elts();
2885
2886 for (long i = 0; i < n; i++) AA[i] = a[i];
2887
2888 new_ifft_short1_flipped(AA, k, mod);
2889
2890 for (long i = 0; i < n; i++) {
2891 umint_t tmp = LazyReduce2(AA[i], q);
2892 A[i] = LazyReduce1(tmp, q);
2893 }
2894 #else
2895 umint_t *AA = (umint_t *) A;
2896 if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2897
2898 new_ifft_short1_flipped(AA, k, mod);
2899
2900 for (long i = 0; i < n; i++) {
2901 umint_t tmp = LazyReduce2(AA[i], q);
2902 AA[i] = LazyReduce1(tmp, q);
2903 }
2904 #endif
2905 }
2906
2907 #endif
2908
2909 //===============================================
2910
2911 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, long bigtab_index)
2912 {
2913 mulmod_t qinv = PrepMulMod(q);
2914
2915 long mr = CalcMaxRoot(q);
2916
2917 info.q = q;
2918 info.qinv = qinv;
2919 info.qrecip = 1/double(q);
2920 info.zz_p_context = 0;
2921
2922
2923 info.RootTable[0].SetLength(mr+1);
2924 info.RootTable[1].SetLength(mr+1);
2925 info.TwoInvTable.SetLength(mr+1);
2926 info.TwoInvPreconTable.SetLength(mr+1);
2927
2928 long *rt = &info.RootTable[0][0];
2929 long *rit = &info.RootTable[1][0];
2930 long *tit = &info.TwoInvTable[0];
2931 mulmod_precon_t *tipt = &info.TwoInvPreconTable[0];
2932
2933 long j;
2934 long t;
2935
2936 rt[mr] = w;
2937 for (j = mr-1; j >= 0; j--)
2938 rt[j] = MulMod(rt[j+1], rt[j+1], q);
2939
2940 rit[mr] = InvMod(w, q);
2941 for (j = mr-1; j >= 0; j--)
2942 rit[j] = MulMod(rit[j+1], rit[j+1], q);
2943
2944 t = InvMod(2, q);
2945 tit[0] = 1;
2946 for (j = 1; j <= mr; j++)
2947 tit[j] = MulMod(tit[j-1], t, q);
2948
2949 for (j = 0; j <= mr; j++)
2950 tipt[j] = LazyPrepMulModPrecon(tit[j], q, qinv);
2951
2952 #ifndef NTL_ENABLE_AVX_FFT
2953 if (bigtab_index != -1) {
2954 long bound = NTL_FFT_BIGTAB_MAXROOT-bigtab_index/NTL_FFT_BIGTAB_LIMIT;
2955 if (bound > NTL_FFT_BIGTAB_MINROOT) {
2956 info.bigtab.make();
2957 info.bigtab->bound = bound;
2958 }
2959 }
2960 #else
2961 // with the AVX implementation, we unconditionally use tables
2962 info.bigtab.make();
2963 #endif
2964 }
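// NOTE: after InitFFTPrimeInfo, assuming the w passed in is a primitive
// 2^mr-th root of unity mod q (mr = CalcMaxRoot(q)), the tables satisfy,
// for 0 <= j <= mr:
//
//    RootTable[0][j]      = w^(2^(mr-j)), a primitive 2^j-th root of unity
//    RootTable[1][j]      = RootTable[0][j]^{-1} mod q
//    TwoInvTable[j]       = 2^{-j} mod q
//    TwoInvPreconTable[j] = LazyPrepMulModPrecon data for TwoInvTable[j]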
2965
2966
2967 //===================================================================
2968
2969 #ifdef NTL_ENABLE_AVX_FFT
2970
2971 static void
2972 pd_LazyPrepMulModPrecon(double *bninv, const double *b, double n, long len)
2973 {
2974 CSRPush push;
2975 pd_LazyPrepMulModPrecon_impl(bninv, b, n, len);
2976 }
2977
2978 static
2979 void LazyPrecompFFTMultipliers(long k, mint_t q, mulmod_t qinv, const mint_t *root, const pd_FFTMultipliers& tab)
2980 {
2981 if (k < 1) LogicError("LazyPrecompFFTMultipliers: bad input");
2982
2983 do { // NOTE: thread safe lazy init
2984 pd_FFTMultipliers::Builder bld(tab, k+1);
2985 long amt = bld.amt();
2986 if (!amt) break;
2987
2988 long first = k+1-amt;
2989 // initialize entries first..k
2990
2991
2992 for (long s = first; s <= k; s++) {
2993 UniquePtr<pd_FFTVectorPair> item;
2994
2995 if (s == 0) {
2996 bld.move(item); // position 0 not used
2997 continue;
2998 }
2999
3000 long m = 1L << s;
3001 long m_half = 1L << (s-1);
3002
3003 item.make();
3004 item->wtab_precomp.SetLength(m_half);
3005 item->wqinvtab_precomp.SetLength(m_half);
3006
3007 double *wtab = item->wtab_precomp.elts();
3008 double *wqinvtab = item->wqinvtab_precomp.elts();
3009
3010 mint_t w = root[s];
3011 mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
3012
3013 mint_t wi = 1;
3014 wtab[0] = wi;
3015 for (long i = 1; i < m_half; i++) {
3016 wi = MulModPrecon(wi, w, q, wqinv);
3017 wtab[i] = wi;
3018 }
3019 pd_LazyPrepMulModPrecon(wqinvtab, wtab, q, m_half);
3020
3021 bld.move(item);
3022 }
3023 } while (0);
3024 }
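// NOTE: unlike the scalar tables, these AVX tables store both the twiddles and
// their precomputed quotients as doubles: wtab_precomp holds w^i converted to
// double, and wqinvtab_precomp is filled from it by pd_LazyPrepMulModPrecon,
// presumably so the packed-double butterflies in pd_FFT.h can consume both
// directly.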
3025
3026 NTL_TLS_GLOBAL_DECL(AlignedArray<double>, pd_AA_store)
3027 static NTL_CHEAP_THREAD_LOCAL long pd_AA_store_len = 0;
3028
3029
3030 #define PD_MIN_K (NTL_LG2_PDSZ+3)
3031 // k must be at least PD_MIN_K
3032
3033 void new_fft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
3034 long yn, long xn)
3035 {
3036 if (k < PD_MIN_K) {
3037 new_fft_notab(A, a, k, info, yn, xn);
3038 return;
3039 }
3040
3041 long dir = 0;
3042
3043 mint_t q = info.q;
3044 const mint_t *root = info.RootTable[dir].elts();
3045 mulmod_t qinv = info.qinv;
3046 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[dir];
3047
3048 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3049
3050 const double *wtab[NTL_FFTMaxRoot+1];
3051 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3052
3053 const double *wqinvtab[NTL_FFTMaxRoot+1];
3054 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3055
3056 pd_mod_t mod;
3057 mod.q = q;
3058 mod.wtab = &wtab[0];
3059 mod.wqinvtab = &wqinvtab[0];
3060
3061 long n = 1L << k;
3062
3063 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3064 if (pd_AA_store_len < n) { pd_AA_store.SetLength(n); pd_AA_store_len = n; }
3065 double *AA = pd_AA_store.elts();
3066
3067 CSRPush push;
3068 pd_fft_trunc_impl(A, a, AA, k, mod, yn, xn);
3069 }
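// NOTE: in the AVX build, these high-level routines delegate the actual
// transform to pd_fft_trunc_impl / pd_ifft_trunc_impl (see pd_FFT.h), working
// out of a packed-double scratch buffer kept in thread-local storage, and fall
// back to the scalar "notab" code only for small sizes (k < PD_MIN_K).
// CSRPush presumably saves and restores the floating-point control/status
// state around the packed-double code.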
3070
3071
3072
3073 void new_fft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
3074 {
3075 if (k < PD_MIN_K) {
3076 new_fft_flipped_notab(A, a, k, info);
3077 return;
3078 }
3079
3080 long dir = 1;
3081
3082 mint_t q = info.q;
3083 const mint_t *root = info.RootTable[dir].elts();
3084 mulmod_t qinv = info.qinv;
3085 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[dir];
3086
3087 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3088
3089 const double *wtab[NTL_FFTMaxRoot+1];
3090 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3091
3092 const double *wqinvtab[NTL_FFTMaxRoot+1];
3093 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3094
3095 pd_mod_t mod;
3096 mod.q = q;
3097 mod.wtab = &wtab[0];
3098 mod.wqinvtab = &wqinvtab[0];
3099
3100 long n = 1L << k;
3101
3102 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3103 if (pd_AA_store_len < n) { pd_AA_store.SetLength(n); pd_AA_store_len = n; }
3104 double *AA = pd_AA_store.elts();
3105
3106 CSRPush push;
3107 pd_fft_trunc_impl(A, a, AA, k, mod, n, n, info.TwoInvTable[k]);
3108 }
3109
3110
3111 void new_ifft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
3112 long yn)
3113 {
3114 if (k < PD_MIN_K) {
3115 new_ifft_notab(A, a, k, info, yn);
3116 return;
3117 }
3118
3119 long dir = 0;
3120
3121 mint_t q = info.q;
3122 const mint_t *root = info.RootTable[1-dir].elts();
3123 const mint_t *root1 = info.RootTable[dir].elts();
3124 mulmod_t qinv = info.qinv;
3125 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[1-dir];
3126 const pd_FFTMultipliers& tab1 = info.bigtab->pd_MulTab[dir];
3127
3128 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3129 if (k >= tab1.length()) LazyPrecompFFTMultipliers(k, q, qinv, root1, tab1);
3130
3131 const double *wtab[NTL_FFTMaxRoot+1];
3132 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3133
3134 const double *wqinvtab[NTL_FFTMaxRoot+1];
3135 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3136
3137 const double *wtab1[NTL_FFTMaxRoot+1];
3138 for (long s = 1; s <= k; s++) wtab1[s] = tab1[s]->wtab_precomp.elts();
3139
3140 const double *wqinvtab1[NTL_FFTMaxRoot+1];
3141 for (long s = 1; s <= k; s++) wqinvtab1[s] = tab1[s]->wqinvtab_precomp.elts();
3142
3143 pd_mod_t mod;
3144 mod.q = q;
3145 mod.wtab = &wtab[0];
3146 mod.wqinvtab = &wqinvtab[0];
3147 mod.wtab1 = &wtab1[0];
3148 mod.wqinvtab1 = &wqinvtab1[0];
3149
3150 long n = 1L << k;
3151
3152 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3153 if (pd_AA_store_len < n) { pd_AA_store.SetLength(n); pd_AA_store_len = n; }
3154 double *AA = pd_AA_store.elts();
3155
3156 CSRPush push;
3157 pd_ifft_trunc_impl(A, a, AA, k, mod, yn, info.TwoInvTable[k]);
3158 }
3159
3160
3161 void new_ifft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
3162 {
3163 if (k < PD_MIN_K) {
3164 new_ifft_flipped_notab(A, a, k, info);
3165 return;
3166 }
3167
3168 long dir = 1;
3169
3170 mint_t q = info.q;
3171 const mint_t *root = info.RootTable[1-dir].elts();
3172 const mint_t *root1 = info.RootTable[dir].elts();
3173 mulmod_t qinv = info.qinv;
3174 const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[1-dir];
3175 const pd_FFTMultipliers& tab1 = info.bigtab->pd_MulTab[dir];
3176
3177 if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3178 if (k >= tab1.length()) LazyPrecompFFTMultipliers(k, q, qinv, root1, tab1);
3179
3180 const double *wtab[NTL_FFTMaxRoot+1];
3181 for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3182
3183 const double *wqinvtab[NTL_FFTMaxRoot+1];
3184 for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3185
3186 const double *wtab1[NTL_FFTMaxRoot+1];
3187 for (long s = 1; s <= k; s++) wtab1[s] = tab1[s]->wtab_precomp.elts();
3188
3189 const double *wqinvtab1[NTL_FFTMaxRoot+1];
3190 for (long s = 1; s <= k; s++) wqinvtab1[s] = tab1[s]->wqinvtab_precomp.elts();
3191
3192 pd_mod_t mod;
3193 mod.q = q;
3194 mod.wtab = &wtab[0];
3195 mod.wqinvtab = &wqinvtab[0];
3196 mod.wtab1 = &wtab1[0];
3197 mod.wqinvtab1 = &wqinvtab1[0];
3198
3199 long n = 1L << k;
3200
3201 NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3202 if (pd_AA_store_len < n) { pd_AA_store.SetLength(n); pd_AA_store_len = n; }
3203 double *AA = pd_AA_store.elts();
3204
3205 CSRPush push;
3206 pd_ifft_trunc_impl(A, a, AA, k, mod, n);
3207 }
3208
3209 #endif
3210
3211
3212 NTL_END_IMPL
3213