
#include <NTL/FFT.h>
#include <NTL/FFT_impl.h>

#ifdef NTL_ENABLE_AVX_FFT
#include <NTL/SmartPtr.h>
#include <NTL/pd_FFT.h>
#endif


/********************************************************************

This is an implementation of a "small prime" FFT, which lies at the heart of
ZZ_pX and zz_pX arithmetic, and impacts many other applications as well
(such as arithmetic in ZZ_pEX, zz_pEX, and ZZX).

The algorithm is a Truncated FFT based on code originally developed by David
Harvey.  David's code built on the single-precision modular multiplication
technique introduced in NTL many years ago, but also uses a "lazy
multiplication" technique, which reduces the number of "correction" steps that
need to be performed in each butterfly (see below for more details).  It also
implements a version of the Truncated FFT algorithm introduced by Joris van der
Hoeven at ISSAC 2004.  Also see "A cache-friendly truncated FFT", David Harvey,
Theoretical Computer Science, Volume 410, Issues 27-29, 28 June 2009, Pages
2649-2658.

I have almost completely re-written David's original code to make it fit into
NTL's software framework; however, all of the key logic is still based on
David's code.  David's original code also implemented a 2D transformation which
is more cache friendly for *very* large transforms.  However, my experiments
indicated this was only beneficial for transforms of size at least 2^20, and so
I did not incorporate this variant.

Here is the Copyright notice from David's original code:


==============================================================================

fft62: a library for number-theoretic transforms

Copyright (C) 2013, David Harvey

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


==============================================================================


SINGLE-PRECISION MODULAR ARITHMETIC

The implementation of arithmetic modulo n, where n is a "word sized" integer,
is critical to the performance of the FFT.  Such word-sized modular arithmetic
is used throughout many other parts of NTL, and is a part of the external,
documented interface.

As NTL was initially built on top of Arjen Lenstra's LIP software, I stole a
lot of ideas from LIP.  One very nice idea was LIP's way of handling
single-precision modular arithmetic.  Back in those days (the early 1990's), I
was targeting 32-bit machines, mainly SPARC stations.  LIP's strategy was to
restrict n to 30 bits; to compute a*b % n, where 0 <= a, b < n, the
following was computed:

   long q = long(double(a) * double(b) / double(n));
   long r = a*b - q*n;
   if (r >= n)
      r -= n;
   else if (r < 0)
      r += n;

With quite reasonable assumptions about floating point (certainly, anything
even remotely close to IEEE 64-bit doubles), the computation of q always gives
the true quotient floor(a*b / n), plus or minus 1.  The computation of r is
done modulo 2^{word size}, and the following if/then/else adjusts r as
necessary.  To be more portable, some of these computations should really be
done using unsigned arithmetic, but that is not so important here.  Also, the
adjustment steps can be replaced by simple non-branching instruction sequences
involving SHIFT, AND, and ADD/SUB instructions.  On some modern machines, this
is usually faster and NTL uses this non-branching strategy.  However, on other
machines (modern x86's are an example of this), conditional move instructions
can be used in place of branching, and this code can be faster than the
non-branching code.  NTL's performance-tuning script will figure out the best
way to do this.


Other simple optimizations can be done, such as precomputing 1/double(n) when n
remains fixed for many computations, as is often the case.
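For concreteness, here is a minimal sketch of that strategy with the reciprocal
precomputed and with a branch-free adjustment (the arithmetic right shift below
is just for illustration; NTL's actual code uses its sp_SignMask and
sp_CorrectExcess/sp_CorrectDeficit helpers, which may compile to either form):

   double ninv = 1/double(n);   // precomputed once for a fixed n

   long q = long(double(a) * double(b) * ninv);
   long r = a*b - q*n;                        // correct modulo 2^{word size}
   r += (r >> (NTL_BITS_PER_LONG-1)) & n;     // if r < 0, add n
   r -= n;
   r += (r >> (NTL_BITS_PER_LONG-1)) & n;     // if that made r negative, add n back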

Note also that this strategy works perfectly well even when a or b are larger
than n, but the quotient itself is bounded by 2^30.

This strategy worked well for many years.  I had considered employing
"Montgomery multiplication", but did not do so for a couple of reasons:
  1) it would require non-portable code, because Montgomery multiplication
     requires the computation of two-word products,
  2) I did not like the idea of working with "alternative representations"
     for integers mod n, as this would make the interfaces more awkward.

At some point in the early 2000's, this strategy was starting to slow things
down, as floating point arithmetic, especially the integer/floating point
conversions, was starting to slow down relative to integer arithmetic.  This
was especially true on x86 machines, which by this time were starting to become
the most important target.  As it happens, later in the 2000's, as the x86
platforms started to use SSE instructions in lieu of the old x87 FPU
instructions, this speed differential again became less of a problem.
Nevertheless, I introduced some new techniques that speed things up across a
variety of platforms.  I introduced this new technique in NTL 5.4 back in 2005.
I never claimed it was particularly new, and I never really documented many
details about it, but since then, it has come to be known as "Shoup
multiplication" in a few papers, so I'll accept that. :-)  The paper "Faster
arithmetic for number-theoretic transforms" [David Harvey, J. Symb. Comp. 60
(2014)] seems to be the first place where it is discussed in detail,
and Harvey's paper also contains some improvements which I discuss below.

The basic idea is that in many computations, not only n, but one of the
arguments, say b, remains fixed for many computations of a*b % n, and so we
can afford to do a little precomputation, based on b and n, to speed things up.
This approach does require the ability to compute double-word products
(actually, just the high word of the product), but it still presents the same
basic interface as before (i.e., no awkward, alternative representations);
moreover, on platforms where we can't get double-word products, the
implementation falls back to the old floating point strategy, and client code
need not be aware of this.

More precisely: suppose 0 <= n < 2^w, 0 <= a < 2^w, and 0 <= b < n.
We precompute bninv = floor(2^w*b/n).  Then if we compute q =
floor(a*bninv/2^w), it can be argued that q is either floor(a*b/n), or is 1 too
small.  The computation of bninv can be done using the floating point
techniques described above.  The computation of q can be done by computing the
high word of a double-word product (it helps if bninv is left-shifted an
appropriate amount first).  Once we have q, we can compute a*b - q*n as before,
and adjust (but now only one adjustment is needed).  So after the
precomputation, the whole operation takes 3 multiplies (one double-word and two
single-word), and a small handful of simple instructions (adds, shifts, etc).
Moreover, two of the three multiplies can start in parallel, on platforms where
this is possible.
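In code, the preconditioned multiplication looks roughly like this (a sketch
only; MulHi stands for an operation returning the high word of the double-word
product of two unsigned words, a role played by ll_mul_hi in the code below):

   // precomputation, for fixed b and n:  bninv = floor(2^w * b / n)

   unsigned long q = MulHi(a, bninv);   // q = floor(a*b/n), possibly 1 too small
   unsigned long r = a*b - q*n;         // low words only; r is in [0, 2*n)
   if (r >= n) r -= n;                  // the single adjustment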

David Harvey noticed that, because multiplies on modern machines are really not
that slow compared to additions, the cost of all of the adjustments (in the
MulMod, as well as in the AddMod's and SubMod's in the basic FFT butterfly
steps) starts to dominate the cost of the FFT.  Indeed, with a straightforward
implementation of the above ideas, there are three multiplies and three
adjustment steps in each butterfly step.  David's idea was to work with
redundant representations mod n, in the range [0..4*n), and thus reduce the
number of adjustments per butterfly from three to one.  I've implemented this
idea here, and it does indeed make a significant difference, which is even more
pronounced when all of the FFT multipliers b and corresponding bninv values are
precomputed.
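To illustrate, one forward butterfly in the lazy regime looks roughly like this
(a sketch; it mirrors the fwd_butterfly macro defined later in this file, where
w is the applicable root and wninv its precomputed bninv value):

   // inputs x0, x1 in [0, 2*n)
   t  = x0 - x1 + 2*n;                     // in [0, 4*n); no correction
   x0 = x0 + x1;                           // in [0, 4*n)
   if (x0 >= 2*n) x0 -= 2*n;               // the single correction, back to [0, 2*n)
   x1 = LazyMulModPrecon(t, w, n, wninv);  // lazy MulMod output is in [0, 2*n)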
My initial implementation of David's ideas (v6.0 in 2013) only implemented his
approach with these precomputed tables: it seemed that without these tables, it
was not a significant improvement.  However, I later figured out how to reduce
the cost of computing all the necessary data "on the fly", in a way that seems
only slightly (10-15%) slower overall.  I introduced this in v9.1 in 2015, and
set things up so that now the pre-computed tables are still used, but not
exclusively, in such a way as to reduce the memory used by these tables for
very large polynomials (either very high degree or lots of FFT primes).  The
idea here is simple, but I haven't seen it discussed elsewhere, so I'll
document the basic idea here.

Suppose we have the preconditioners for a and b, and want a*b % n along with
the preconditioner for a*b % n.

For a, let us suppose that we have both q1 and r1, where:
   2^w*a = n*q1 + r1
We can obtain both q1 and r1 using floating point techniques.

Step 1. Compute a*b % n, using the integer-only MulMod, with
the preconditioner for either a or b.

Step 2. Compute q2 and r2 such that
   r1*b = n*q2 + r2
We can obtain these using the integer-only MulMod, preconditioned on b.
Actually, we only need q2, not r2.

Step 3. Compute
   q3 = q1*b + q2 mod 2^w
which we can compute with just a single-word multiply and an addition.

One can easily show that the value q3 computed above is indeed the
preconditioner for a*b % n.

Note that, in theory, if the computation in Step 2 is done using the
preconditioner for a (i.e., q1), then the multiplication q1*b in Step 3 should
not really be necessary (assuming that computing both high and low words of a
double-word product is no more expensive than just computing the low word).
However, none of the compilers I've used have been able to perform that
optimization (in NTL v11.1, I added code that hand-codes this optimization).
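Putting the three steps together (a sketch in the notation above;
MulModPreconQuo denotes a hypothetical variant of the preconditioned MulMod
that returns the quotient rather than the remainder; the LazyMulModPreconQuo
routine defined later in this file plays that role, with shifted
preconditioners):

   // given 2^w*a = n*q1 + r1, and bninv = preconditioner for b
   c  = MulModPrecon(a, b, n, bninv);       // Step 1: c = a*b % n
   q2 = MulModPreconQuo(r1, b, n, bninv);   // Step 2: q2 = floor(r1*b / n)
   q3 = q1*b + q2;                          // Step 3: mod 2^w comes for free
   // q3 is the preconditioner for c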


64-BIT MACHINES

Current versions of NTL use (by default) 60-bit moduli based
on all-integer arithmetic.


Prior to v9.0 of NTL, on 64-bit machines, the modulus n was restricted to 50
bits, in order to allow the use of double-precision techniques, as doubles have
53 bits of precision.  However, NTL now supports 60-bit moduli.  Actually, 62
bits can be supported by setting the NTL_MAXIMIZE_SP_NBITS configuration flag,
but other things (namely, the TBL_REM implementation in lip.cpp) start to slow
down if 62 bits are used, so 60 seems like a good compromise.  Currently,
60-bit moduli are available only when compiling NTL with GMP, and when some
kind of extended integer or floating point arithmetic is available.


FUTURE TRENDS


* The following papers

   https://eprint.iacr.org/2017/727
   https://eprint.iacr.org/2016/504
   https://eprint.iacr.org/2015/382

present FFTs that access the pre-computed tables in a somewhat more efficient
fashion, so that we only need to read from the tables O(n) times, rather than
O(n log n) times.

I've partially implemented this, and have gotten mixed results.
For smallish FFT's (below k=10 or 11), this code is somewhat slower.
For larger FFT's (say, k=17), I see a speedup of 3-10%.


********************************************************************/



#define NTL_FFT_BIGTAB_LIMIT (180)
#define NTL_FFT_BIGTAB_MAXROOT (17)
#define NTL_FFT_BIGTAB_MINROOT (7)

// table sizes are bounded by 2^bound, where
// bound = NTL_FFT_BIGTAB_MAXROOT - index/NTL_FFT_BIGTAB_LIMIT.
// Here, index is the index of an FFT prime, or 0 for a user FFT prime.
// If bound <= NTL_FFT_BIGTAB_MINROOT, then big tables are not used,
// so only the first
//    (NTL_FFT_BIGTAB_MAXROOT-NTL_FFT_BIGTAB_MINROOT)*NTL_FFT_BIGTAB_LIMIT
// FFT primes will have big tables.
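// For example, with the default settings above, the FFT prime with index 500
// gets bound = 17 - 500/180 = 17 - 2 = 15 (integer division), and big tables
// are used only for the first (17-7)*180 = 1800 FFT primes.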

// NOTE: in newer versions of NTL (v9.1 and later), the BIGTAB
// code is only about 5-15% faster than the non-BIGTAB code, so
// this is not a great time/space trade-off.
// However, some further optimizations may only be implemented
// if big tables are used.

// NOTE: NTL_FFT_BIGTAB_MAXROOT is set independently of the parameter
// NTL_FFTMaxRoot defined in FFT.h (and which is typically 25).
// The space for the LazyTable FFTMultipliers could be reduced a bit
// by using min(NTL_FFT_BIGTAB_MAXROOT, NTL_FFTMaxRoot) + 1 for the
// size of these tables.
269 
270 
271 
272 NTL_START_IMPL
273 
274 
275 
276 class FFTVectorPair {
277 public:
278    Vec<long> wtab_precomp;
279    Vec<mulmod_precon_t> wqinvtab_precomp;
280 };
281 
282 typedef LazyTable<FFTVectorPair, NTL_FFTMaxRoot+1> FFTMultipliers;
283 
284 
285 #ifdef NTL_ENABLE_AVX_FFT
286 class pd_FFTVectorPair {
287 public:
288    AlignedArray<double> wtab_precomp;
289    AlignedArray<double> wqinvtab_precomp;
290 };
291 
292 typedef LazyTable<pd_FFTVectorPair, NTL_FFTMaxRoot+1> pd_FFTMultipliers;
293 #endif
294 
295 
296 
297 class FFTMulTabs {
298 public:
299 
300 #ifndef NTL_ENABLE_AVX_FFT
301    long bound;
302    FFTMultipliers MulTab;
303 #else
304    pd_FFTMultipliers pd_MulTab[2];
305 #endif
306 
307 };
308 
void FFTMulTabsDeleterPolicy::deleter(FFTMulTabs *p) { delete p; }
310 
311 
312 
313 FFTTablesType FFTTables;
314 // a truly GLOBAL variable, shared among all threads
315 
316 
317 
long IsFFTPrime(long n, long& w)
319 {
320    long  m, x, y, z;
321    long j, k;
322 
323 
324    if (n <= 1 || n >= NTL_SP_BOUND) return 0;
325 
326    if (n % 2 == 0) return 0;
327 
328    if (n % 3 == 0) return 0;
329 
330    if (n % 5 == 0) return 0;
331 
332    if (n % 7 == 0) return 0;
333 
334    m = n - 1;
335    k = 0;
336    while ((m & 1) == 0) {
337       m = m >> 1;
338       k++;
339    }
340 
341    for (;;) {
342       x = RandomBnd(n);
343 
344       if (x == 0) continue;
345       z = PowerMod(x, m, n);
346       if (z == 1) continue;
347 
348       x = z;
349       j = 0;
350       do {
351          y = z;
352          z = MulMod(y, y, n);
353          j++;
354       } while (j != k && z != 1);
355 
356       if (z != 1 || y !=  n-1) return 0;
357 
358       if (j == k)
359          break;
360    }
361 
362    /* x^{2^k} = 1 mod n, x^{2^{k-1}} = -1 mod n */
363 
364    long TrialBound;
365 
366    TrialBound = m >> k;
367    if (TrialBound > 0) {
368       if (!ProbPrime(n, 5)) return 0;
369 
370       /* we have to do trial division by special numbers */
371 
372       TrialBound = SqrRoot(TrialBound);
373 
374       long a, b;
375 
376       for (a = 1; a <= TrialBound; a++) {
377          b = (a << k) + 1;
378          if (n % b == 0) return 0;
379       }
380    }
381 
382    /* n is an FFT prime */
383 
384 
385    for (j = NTL_FFTMaxRoot; j < k; j++) {
386       x = MulMod(x, x, n);
387    }
388 
389    w = x;
390 
391    return 1;
392 }
393 
394 
395 static
void NextFFTPrime(long& q, long& w, long index)
397 {
398    static long m = NTL_FFTMaxRootBnd + 1;
399    static long k = 0;
400    // m and k are truly GLOBAL variables, shared among
401    // all threads.  Access is protected by a critical section
402    // guarding FFTTables
403 
404    static long last_index = -1;
405    static long last_m = 0;
406    static long last_k = 0;
407 
408    if (index == last_index) {
409       // roll back m and k...part of a simple error recovery
410       // strategy if an exception was thrown in the last
411       // invocation of UseFFTPrime...probably of academic
412       // interest only
413 
414       m = last_m;
415       k = last_k;
416    }
417    else {
418       last_index = index;
419       last_m = m;
420       last_k = k;
421    }
422 
423    long t, cand;
424 
425    for (;;) {
426       if (k == 0) {
427          m--;
428          if (m < 5) ResourceError("ran out of FFT primes");
429          k = 1L << (NTL_SP_NBITS-m-2);
430       }
431 
432       k--;
433 
434       cand = (1L << (NTL_SP_NBITS-1)) + (k << (m+1)) + (1L << m) + 1;
435 
436       if (!IsFFTPrime(cand, t)) continue;
437       q = cand;
438       w = t;
439       return;
440    }
441 }
442 
443 
long CalcMaxRoot(long p)
445 {
446    p = p-1;
447    long k = 0;
448    while ((p & 1) == 0) {
449       p = p >> 1;
450       k++;
451    }
452 
453    if (k > NTL_FFTMaxRoot)
454       return NTL_FFTMaxRoot;
455    else
456       return k;
457 }
458 
459 
460 
461 
462 #ifndef NTL_WIZARD_HACK
463 SmartPtr<zz_pInfoT> Build_zz_pInfo(FFTPrimeInfo *info);
464 #else
SmartPtr<zz_pInfoT> Build_zz_pInfo(FFTPrimeInfo *info) { return 0; }
466 #endif
467 
void UseFFTPrime(long index)
469 {
   if (index < 0) LogicError("invalid FFT prime index");
   if (index >= NTL_MAX_FFTPRIMES) ResourceError("FFT prime index too large");

   if (index+1 >= NTL_NSP_BOUND) ResourceError("FFT prime index too large");
   // largely academic, but it is a convenient assumption
475 
476    do {  // NOTE: thread safe lazy init
477       FFTTablesType::Builder bld(FFTTables, index+1);
478       long amt = bld.amt();
479       if (!amt) break;
480 
481       long first = index+1-amt;
482       // initialize entries first..index
483 
484       long i;
485       for (i = first; i <= index; i++) {
486          UniquePtr<FFTPrimeInfo> info;
487          info.make();
488 
489          long q, w;
490          NextFFTPrime(q, w, i);
491 
492          long bigtab_index = -1;
493 
494 #ifdef NTL_FFT_BIGTAB
495          bigtab_index = i;
496 #endif
497 
498          InitFFTPrimeInfo(*info, q, w, bigtab_index);
499          info->zz_p_context = Build_zz_pInfo(info.get());
500          bld.move(info);
501       }
502 
503    } while (0);
504 }
505 
506 
507 #ifdef NTL_FFT_LAZYMUL
508 // we only honor the FFT_LAZYMUL flag if either the SPMM_ULL_VIABLE or LONGLONG_SP_MULMOD
509 // flags are set
510 
511 #if (!defined(NTL_SPMM_ULL_VIABLE) && !defined(NTL_LONGLONG_SP_MULMOD))
512 #undef NTL_FFT_LAZYMUL
513 
514 // raise an error if running the wizard
515 #if (defined(NTL_WIZARD_HACK))
516 #error "cannot honor NTL_FFT_LAZYMUL"
517 #endif
518 
519 #endif
520 
521 #endif
522 
523 
524 
525 
526 #ifdef NTL_FFT_LAZYMUL
527 // FFT with  lazy multiplication
528 
529 #ifdef NTL_CLEAN_INT
530 #define NTL_FFT_USEBUF
531 #endif
// DIRT: with the lazy multiplication strategy, we have to work
// with unsigned long's rather than long's.  To avoid unnecessary
// copying, we simply cast long* to unsigned long*.
// Is this standards compliant? Does it evoke Undefined Behavior?
// The C++ standards before C++14 were actually somewhat inconsistent
// on this point.
538 
539 // In all versions of the C++ and C standards, the "strict aliasing"
540 // rules [basic.lval] have always said that signed/unsigned can
541 // always alias each other.  So this does not break the strict
542 // aliasing rules.  However, prior to C++14, the section
543 // on Lvalue-to-rvalue conversion [conv.lval] said that
544 // this was actually UB.  This has been cleared up in C++14,
// where now it is no longer UB.  Actually, it seems that the change
546 // to C++14 was cleaning up an inconsistency in the standard
547 // itself, and not really a change in the language definition.
548 
549 // In practice, it does make a significant difference in performance
550 // to avoid all these copies, so the default is avoid them.
551 
552 // See: https://stackoverflow.com/questions/30048135/efficient-way-to-bit-copy-a-signed-integer-to-an-unsigned-integer
553 
554 // See: https://stackoverflow.com/questions/27109701/aliasing-of-otherwise-equivalent-signed-and-unsigned-types
555 // Especially comments by Columbo regarding N3797 and [conv.lval]
556 
557 
558 
559 
560 
561 
562 #if (defined(NTL_LONGLONG_SP_MULMOD))
563 
564 
565 #if (NTL_BITS_PER_LONG >= NTL_SP_NBITS+4)
566 
567 static inline unsigned long
sp_NormalizedLazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, unsigned long ninv)
569 {
570    unsigned long H = cast_unsigned(b);
571    unsigned long Q = ll_mul_hi(H << 4, ninv);
572    unsigned long L = cast_unsigned(b) << (NTL_SP_NBITS+2);
573    long r = L - Q*cast_unsigned(n);  // r in [0..2*n)
574 
575    r = sp_CorrectExcessQuo(Q, r, n);
576    rres = r;
577    return Q; // NOTE: not shifted
578 }
579 
580 static inline unsigned long
sp_NormalizedLazyPrepMulModPrecon(long b, long n, unsigned long ninv)
582 {
583    unsigned long H = cast_unsigned(b);
584    unsigned long Q = ll_mul_hi(H << 4, ninv);
585    unsigned long L = cast_unsigned(b) << (NTL_SP_NBITS+2);
586    long r = L - Q*cast_unsigned(n);  // r in [0..2*n)
587 
588    Q += 1L + sp_SignMask(r-n);
589    return Q; // NOTE: not shifted
590 }
591 
592 
593 #else
594 
595 // NTL_BITS_PER_LONG == NTL_SP_NBITS+2
596 static inline unsigned long
sp_NormalizedLazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, unsigned long ninv)
598 {
599    unsigned long H = cast_unsigned(b) << 2;
600    unsigned long Q = ll_mul_hi(H, (ninv << 1)) + H;
601    unsigned long rr = -Q*cast_unsigned(n);  // r in [0..3*n)
602 
603    long r = sp_CorrectExcessQuo(Q, rr, n);
604    r = sp_CorrectExcessQuo(Q, r, n);
605    rres = r;
606    return Q;  // NOTE: not shifted
607 }
608 
609 static inline unsigned long
sp_NormalizedLazyPrepMulModPrecon(long b, long n, unsigned long ninv)
611 {
612    unsigned long H = cast_unsigned(b) << 2;
613    unsigned long Q = ll_mul_hi(H, (ninv << 1)) + H;
614    unsigned long rr = -Q*cast_unsigned(n);  // r in [0..3*n)
615    Q += 2L + sp_SignMask(rr-n) + sp_SignMask(rr-2*n);
616    return Q; // NOTE: not shifted
617 }
618 
619 
620 #endif
621 
622 
623 static inline unsigned long
LazyPrepMulModPrecon(long b, long n, sp_inverse ninv)
625 {
626    return sp_NormalizedLazyPrepMulModPrecon(b << ninv.shamt, n << ninv.shamt, ninv.inv) << (NTL_BITS_PER_LONG-NTL_SP_NBITS-2);
627 }
628 
629 
630 static inline unsigned long
LazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, sp_inverse ninv)
632 {
633    unsigned long qq, rr;
634    qq = sp_NormalizedLazyPrepMulModPreconWithRem(rr, b << ninv.shamt, n << ninv.shamt, ninv.inv);
635    rres = rr >> ninv.shamt;
636    return qq << (NTL_BITS_PER_LONG-NTL_SP_NBITS-2);
637 }
638 
639 
640 
641 
642 
643 
644 
645 
646 #elif (NTL_BITS_PER_LONG - NTL_SP_NBITS >= 4 && NTL_WIDE_DOUBLE_PRECISION - NTL_SP_NBITS >= 4)
647 
648 
649 // slightly faster functions, which should kick in on x86-64, where
650 //    NTL_BITS_PER_LONG == 64
651 //    NTL_SP_NBITS == 60 (another reason for holding this back to 60 bits)
652 //    NTL_WIDE_DOUBLE_PRECISION == 64
653 
// DIRT: if the relative error in floating point calculations (muls and reciprocals)
//   is <= epsilon, the relative error in the computed result is <= 3*epsilon +
//   O(epsilon^2), and we require that this relative error is at most
//   2^{-(NTL_SP_NBITS+2)}, so it should be pretty safe as long as
//   epsilon is at most, or not much greater than, 2^{-NTL_WIDE_DOUBLE_PRECISION}.
659 
660 static inline
unsigned long LazyPrepMulModPrecon(long b, long n, wide_double ninv)
662 {
663    long q = (long) ( (((wide_double) b) * wide_double(4*NTL_SP_BOUND)) * ninv );
664 
665    unsigned long rr = (cast_unsigned(b) << (NTL_SP_NBITS+2))
666                        - cast_unsigned(q)*cast_unsigned(n);
667 
668    q += sp_SignMask(rr) + sp_SignMask(rr-n) + 1L;
669 
670    return cast_unsigned(q) << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
671 }
672 
673 static inline
unsigned long LazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, wide_double ninv)
675 {
676    long q = (long) ( (((wide_double) b) * wide_double(4*NTL_SP_BOUND)) * ninv );
677 
678    unsigned long rr = (cast_unsigned(b) << (NTL_SP_NBITS+2))
679                        - cast_unsigned(q)*cast_unsigned(n);
680 
681    long r = sp_CorrectDeficitQuo(q, rr, n);
682    r = sp_CorrectExcessQuo(q, r, n);
683 
684    unsigned long qres = cast_unsigned(q) << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
685    rres = r;
686    return qres;
687 }
688 
689 #else
690 
691 
692 static inline
unsigned long LazyPrepMulModPrecon(long b, long n, wide_double ninv)
694 {
695    long q = (long) ( (((wide_double) b) * wide_double(NTL_SP_BOUND)) * ninv );
696 
697    unsigned long rr = (cast_unsigned(b) << (NTL_SP_NBITS))
698                        - cast_unsigned(q)*cast_unsigned(n);
699 
700    long r = sp_CorrectDeficitQuo(q, rr, n);
701    r = sp_CorrectExcessQuo(q, r, n);
702 
703    unsigned long qq = q;
704 
705    qq = 2*qq;
706    r = 2*r;
707    r = sp_CorrectExcessQuo(qq, r, n);
708 
709    qq = 2*qq;
710    r = 2*r;
711    qq += sp_SignMask(r-n) + 1L;
712 
713    return qq << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
714 }
715 
716 
717 
718 
719 
720 static inline
unsigned long LazyPrepMulModPreconWithRem(unsigned long& rres, long b, long n, wide_double ninv)
722 {
723    long q = (long) ( (((wide_double) b) * wide_double(NTL_SP_BOUND)) * ninv );
724 
725    unsigned long rr = (cast_unsigned(b) << (NTL_SP_NBITS))
726                        - cast_unsigned(q)*cast_unsigned(n);
727 
728    long r = sp_CorrectDeficitQuo(q, rr, n);
729    r = sp_CorrectExcessQuo(q, r, n);
730 
731    unsigned long qq = q;
732 
733    qq = 2*qq;
734    r = 2*r;
735    r = sp_CorrectExcessQuo(qq, r, n);
736 
737    qq = 2*qq;
738    r = 2*r;
739    r = sp_CorrectExcessQuo(qq, r, n);
740 
741    rres = r;
742    return qq << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
743 }
744 
745 #endif
746 
747 
748 
749 static inline
unsigned long LazyMulModPreconQuo(unsigned long a, unsigned long b,
751                                   unsigned long n, unsigned long bninv)
752 {
753    unsigned long q = ll_mul_hi(a, bninv);
754    unsigned long r = a*b - q*n;
755    q += sp_SignMask(r-n) + 1L;
756    return q << (NTL_BITS_PER_LONG - NTL_SP_NBITS - 2);
757 }
758 
759 
760 static inline
unsigned long LazyMulModPrecon(unsigned long a, unsigned long b,
762                                unsigned long n, unsigned long bninv)
763 {
764    unsigned long q = ll_mul_hi(a, bninv);
765    unsigned long res = a*b - q*n;
766    return res;
767 }
768 
769 
770 typedef long mint_t;
771 typedef unsigned long umint_t;
772 // For readability and to make it easier to adapt this
773 // code to other settings
774 
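// input in [0, 2*q), output in [0, q)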
775 static inline
umint_t LazyReduce1(umint_t a, mint_t q)
777 {
778   return sp_CorrectExcess(mint_t(a), q);
779 }
780 
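// input in [0, 4*q), output in [0, 2*q)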
781 static inline
umint_t LazyReduce2(umint_t a, mint_t q)
783 {
784   return sp_CorrectExcess(a, 2*q);
785 }
786 
787 
788 // inputs in [0, 2*n), output in [0, 4*n)
789 static inline
umint_t LazyAddMod(umint_t a, umint_t b, mint_t n)
791 {
792    return a+b;
793 }
794 
795 // inputs in [0, 2*n), output in [0, 4*n)
796 static inline
umint_t LazySubMod(umint_t a, umint_t b, mint_t n)
798 {
799    return a-b+2*n;
800 }
801 
802 // inputs in [0, 2*n), output in [0, 2*n)
803 static inline
umint_t LazyAddMod2(umint_t a, umint_t b, mint_t n)
805 {
806    umint_t r = a+b;
807    return sp_CorrectExcess(r, 2*n);
808 }
809 
810 // inputs in [0, 2*n), output in [0, 2*n)
811 static inline
umint_t LazySubMod2(umint_t a, umint_t b, mint_t n)
813 {
814    umint_t r = a-b;
815    return sp_CorrectDeficit(r, 2*n);
816 }
817 
818 #ifdef NTL_AVOID_BRANCHING
819 
820 // x, y in [0, 4*m)
821 // returns x + y mod 4*m, in [0, 4*m)
822 inline static umint_t
LazyAddMod4(umint_t x, umint_t y, mint_t m)
824 {
825    x = LazyReduce2(x, m);
826    y = LazyReduce2(y, m);
827    return x+y;
828 }
829 
830 // x, y in [0, 4*m)
831 // returns x - y mod 4*m, in [0, 4*m)
832 inline static umint_t
LazySubMod4(umint_t x, umint_t y, mint_t m)
834 {
835    x = LazyReduce2(x, m);
836    y = LazyReduce2(y, m);
837    return x-y+2*m;
838 }
839 
840 #else
841 
842 static inline umint_t
LazyAddMod4(umint_t x, umint_t y, umint_t m)
844 {
845   y = 4*m - y;
846   umint_t z = x - y;
847   z += (x < y) ? 4*m : 0;
848   return z;
849 }
850 
851 
852 static inline umint_t
LazySubMod4(umint_t x, umint_t y, umint_t m)
854 {
855   umint_t z = x - y;
856   z += (x < y) ? 4*m : 0;
857   return z;
858 }
859 
860 #endif
861 
862 // Input and output in [0, 4*n)
863 static inline umint_t
LazyDoubleMod4(umint_t a, mint_t n)
865 {
866    return 2 * LazyReduce2(a, n);
867 }
868 
869 // Input and output in [0, 2*n)
870 static inline umint_t
LazyDoubleMod2(umint_t a, mint_t n)
872 {
873    return 2 * LazyReduce1(a, n);
874 }
875 
void ComputeMultipliers(Vec<FFTVectorPair>& v, long k, mint_t q, mulmod_t qinv, const mint_t* root)
877 {
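   // Builds the root tables used by the FFT routines below: on return,
   // v[s].wtab_precomp[i] = root[s]^i mod q for 0 <= i < 2^(s-1), and
   // v[s].wqinvtab_precomp[i] holds the corresponding preconditioner
   // (see the verification code at the end of this function).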
878 
879    long old_len = v.length();
880    v.SetLength(k+1);
881 
882    for (long s = max(old_len, 1); s <= k; s++) {
883       v[s].wtab_precomp.SetLength(1L << (s-1));
884       v[s].wqinvtab_precomp.SetLength(1L << (s-1));
885    }
886 
887    if (k >= 1) {
888       v[1].wtab_precomp[0] = 1;
889       v[1].wqinvtab_precomp[0] = LazyPrepMulModPrecon(1, q, qinv);
890    }
891 
892    if (k >= 2) {
893       v[2].wtab_precomp[0] = v[1].wtab_precomp[0];
894       v[2].wtab_precomp[1] = root[2];
895       v[2].wqinvtab_precomp[0] = v[1].wqinvtab_precomp[0];
896       v[2].wqinvtab_precomp[1] = LazyPrepMulModPrecon(root[2], q, qinv);
897    }
898 
899    for (long s = 3; s <= k; s++) {
900       long m = 1L << s;
901       long m_half = 1L << (s-1);
902       long m_fourth = 1L << (s-2);
903       mint_t* NTL_RESTRICT wtab = v[s].wtab_precomp.elts();
904       mint_t* NTL_RESTRICT wtab1 = v[s-1].wtab_precomp.elts();
905       mulmod_precon_t* NTL_RESTRICT wqinvtab = v[s].wqinvtab_precomp.elts();
906       mulmod_precon_t* NTL_RESTRICT wqinvtab1 = v[s-1].wqinvtab_precomp.elts();
907 
908       mint_t w = root[s];
909       umint_t wqinv_rem;
910       mulmod_precon_t wqinv = LazyPrepMulModPreconWithRem(wqinv_rem, w, q, qinv);
911 
912 
913       for (long i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
914          mint_t w_j = wtab1[j];
915          mulmod_precon_t wqi_j = wqinvtab1[j];
916 
917 #if 0
918          mint_t w_i = LazyReduce1(LazyMulModPrecon(w_j, w, q, wqinv), q);
919          mulmod_precon_t wqi_i = LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j)
920                                    + cast_unsigned(w_j)*wqinv;
921 #else
922          // This code sequence makes sure the compiler sees
923          // that the product w_j*wqinv needs to be computed just once
924          ll_type x;
925          ll_mul(x, w_j, wqinv);
926          umint_t hi = ll_get_hi(x);
927          umint_t lo = ll_get_lo(x);
928          umint_t r = cast_unsigned(w_j)*cast_unsigned(w) - hi*cast_unsigned(q);
929 
930          mint_t w_i = LazyReduce1(r, q);
931          mulmod_precon_t wqi_i = lo+LazyMulModPreconQuo(wqinv_rem, w_j, q, wqi_j);
932 #endif
933 
934          wtab[i-1] = w_j;
935          wqinvtab[i-1] = wqi_j;
936          wtab[i] = w_i;
937          wqinvtab[i] = wqi_i;
938       }
939    }
940 
941 #if 0
942    // verify result
943    for (long s = 1; s <= k; s++) {
944       mint_t *wtab = v[s].wtab_precomp.elts();
945       mulmod_precon_t *wqinvtab = v[s].wqinvtab_precomp.elts();
946       long m_half = 1L << (s-1);
947 
948       mint_t w = root[s];
949       mint_t w_i = 1;
950       for (long i = 0; i < m_half; i++) {
951          if (wtab[i] != w_i || wqinvtab[i] != LazyPrepMulModPrecon(w_i, q, qinv))
952             Error("bad table entry");
953          w_i = MulMod(w_i, w, q, qinv);
954       }
955    }
956 #endif
957 }
958 
959 
960 #else
961 
962 
963 // Hacks to make the LAZY code work with ordinary modular arithmetic
964 
965 typedef long mint_t;
966 typedef long umint_t;
967 
static inline mint_t IdentityMod(mint_t a, mint_t q) { return a; }
static inline mint_t DoubleMod(mint_t a, mint_t q) { return AddMod(a, a, q); }
970 
971 #define LazyPrepMulModPrecon PrepMulModPrecon
972 #define LazyMulModPrecon MulModPrecon
973 
974 #define LazyReduce1 IdentityMod
975 #define LazyReduce2 IdentityMod
976 #define LazyAddMod AddMod
977 #define LazySubMod SubMod
978 #define LazyAddMod2 AddMod
979 #define LazySubMod2 SubMod
980 #define LazyAddMod4 AddMod
981 #define LazySubMod4 SubMod
982 #define LazyDoubleMod2 DoubleMod
983 #define LazyDoubleMod4 DoubleMod
984 
985 
void ComputeMultipliers(Vec<FFTVectorPair>& v, long k, mint_t q, mulmod_t qinv, const mint_t* root)
987 {
988 
989    long old_len = v.length();
990    v.SetLength(k+1);
991 
992    for (long s = max(old_len, 1); s <= k; s++) {
993       v[s].wtab_precomp.SetLength(1L << (s-1));
994       v[s].wqinvtab_precomp.SetLength(1L << (s-1));
995    }
996 
997    if (k >= 1) {
998       v[1].wtab_precomp[0] = 1;
999       v[1].wqinvtab_precomp[0] = PrepMulModPrecon(1, q, qinv);
1000    }
1001 
1002    if (k >= 2) {
1003       v[2].wtab_precomp[0] = v[1].wtab_precomp[0];
1004       v[2].wtab_precomp[1] = root[2];
1005       v[2].wqinvtab_precomp[0] = v[1].wqinvtab_precomp[0];
1006       v[2].wqinvtab_precomp[1] = PrepMulModPrecon(root[2], q, qinv);
1007    }
1008 
1009    for (long s = 3; s <= k; s++) {
1010       long m = 1L << s;
1011       long m_half = 1L << (s-1);
1012       long m_fourth = 1L << (s-2);
1013       mint_t* NTL_RESTRICT wtab = v[s].wtab_precomp.elts();
1014       mint_t* NTL_RESTRICT wtab1 = v[s-1].wtab_precomp.elts();
1015       mulmod_precon_t* NTL_RESTRICT wqinvtab = v[s].wqinvtab_precomp.elts();
1016       mulmod_precon_t* NTL_RESTRICT wqinvtab1 = v[s-1].wqinvtab_precomp.elts();
1017 
1018       mint_t w = root[s];
1019       mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
1020 
1021 
1022       for (long i = m_half-1, j = m_fourth-1; i >= 0; i -= 2, j--) {
1023          mint_t w_j = wtab1[j];
1024          mulmod_precon_t wqi_j = wqinvtab1[j];
1025 
1026          mint_t w_i = MulModPrecon(w_j, w, q, wqinv);
1027          mulmod_precon_t wqi_i = PrepMulModPrecon(w_i, q, qinv);
1028 
1029          wtab[i-1] = w_j;
1030          wqinvtab[i-1] = wqi_j;
1031          wtab[i] = w_i;
1032          wqinvtab[i] = wqi_i;
1033       }
1034    }
1035 
1036 #if 0
1037    // verify result
1038    for (long s = 1; s <= k; s++) {
1039       mint_t *wtab = v[s].wtab_precomp.elts();
1040       mulmod_precon_t *wqinvtab = v[s].wqinvtab_precomp.elts();
1041       long m_half = 1L << (s-1);
1042 
1043       mint_t w = root[s];
1044       mint_t w_i = 1;
1045       for (long i = 0; i < m_half; i++) {
1046          if (wtab[i] != w_i || wqinvtab[i] != PrepMulModPrecon(w_i, q, qinv))
1047             Error("bad table entry");
1048          w_i = MulMod(w_i, w, q, qinv);
1049       }
1050    }
1051 #endif
1052 }
1053 
1054 #endif
1055 
1056 
1057 
1058 static
void LazyPrecompFFTMultipliers(long k, mint_t q, mulmod_t qinv, const mint_t *root, const FFTMultipliers& tab)
1060 {
1061    if (k < 1) LogicError("LazyPrecompFFTMultipliers: bad input");
1062 
1063    do { // NOTE: thread safe lazy init
1064       FFTMultipliers::Builder bld(tab, k+1);
1065       long amt = bld.amt();
1066       if (!amt) break;
1067 
1068       long first = k+1-amt;
1069       // initialize entries first..k
1070 
1071 
1072       for (long s = first; s <= k; s++) {
1073          UniquePtr<FFTVectorPair> item;
1074 
1075          if (s == 0) {
1076             bld.move(item); // position 0 not used
1077             continue;
1078          }
1079 
1080          if (s == 1) {
1081             item.make();
1082             item->wtab_precomp.SetLength(1);
1083             item->wqinvtab_precomp.SetLength(1);
1084             item->wtab_precomp[0] = 1;
1085             item->wqinvtab_precomp[0] = LazyPrepMulModPrecon(1, q, qinv);
1086             bld.move(item);
1087             continue;
1088          }
1089 
1090          item.make();
1091          item->wtab_precomp.SetLength(1L << (s-1));
1092          item->wqinvtab_precomp.SetLength(1L << (s-1));
1093 
1094          long m = 1L << s;
1095          long m_half = 1L << (s-1);
1096          long m_fourth = 1L << (s-2);
1097 
1098          const mint_t *wtab_last = tab[s-1]->wtab_precomp.elts();
1099          const mulmod_precon_t *wqinvtab_last = tab[s-1]->wqinvtab_precomp.elts();
1100 
1101          mint_t *wtab = item->wtab_precomp.elts();
1102          mulmod_precon_t *wqinvtab = item->wqinvtab_precomp.elts();
1103 
1104          for (long i = 0; i < m_fourth; i++) {
1105             wtab[i] = wtab_last[i];
1106             wqinvtab[i] = wqinvtab_last[i];
1107          }
1108 
1109          mint_t w = root[s];
1110          mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, qinv);
1111 
1112          // prepare wtab...
1113 
1114          if (s == 2) {
1115             wtab[1] = LazyReduce1(LazyMulModPrecon(wtab[0], w, q, wqinv), q);
1116             wqinvtab[1] = LazyPrepMulModPrecon(wtab[1], q, qinv);
1117          }
1118          else {
1119             long i, j;
1120 
1121             i = m_half-1; j = m_fourth-1;
1122             wtab[i-1] = wtab[j];
1123             wqinvtab[i-1] = wqinvtab[j];
1124             wtab[i] = LazyReduce1(LazyMulModPrecon(wtab[i-1], w, q, wqinv), q);
1125 
1126             i -= 2; j --;
1127 
1128             for (; i >= 0; i -= 2, j --) {
1129                mint_t wp2 = wtab[i+2];
1130                mint_t wm1 = wtab[j];
1131                wqinvtab[i+2] = LazyPrepMulModPrecon(wp2, q, qinv);
1132                wtab[i-1] = wm1;
1133                wqinvtab[i-1] = wqinvtab[j];
1134                wtab[i] = LazyReduce1(LazyMulModPrecon(wm1, w, q, wqinv), q);
1135             }
1136 
1137             wqinvtab[1] = LazyPrepMulModPrecon(wtab[1], q, qinv);
1138          }
1139 
1140          bld.move(item);
1141       }
1142    } while (0);
1143 }
1144 
1145 
1146 //===================================================================
1147 
1148 // TRUNCATED FFT
1149 
// This code is derived from code originally developed
// by David Harvey.  I include his original documentation,
// annotated appropriately to highlight differences in
// the implementation (see NOTEs).
1154 
1155 /*
1156   The DFT is defined as follows.
1157 
1158   Let the input sequence be a_0, ..., a_{N-1}.
1159 
1160   Let w = standard primitive N-th root of 1, i.e. w = g^(2^FFT62_MAX_LGN / N),
1161   where g = some fixed element of Z/pZ of order 2^FFT62_MAX_LGN.
1162 
1163   Let Z = an element of (Z/pZ)^* (twisting parameter).
1164 
1165   Then the output sequence is
1166     b_j = \sum_{0 <= i < N} Z^i a_i w^(ij'), for 0 <= j < N,
1167   where j' is the length-lgN bit-reversal of j.
1168 
1169   Some of the FFT routines can operate on truncated sequences of certain
1170   "admissible" sizes. A size parameter n is admissible if 1 <= n <= N, and n is
1171   divisible by a certain power of 2. The precise power depends on the recursive
1172   array decomposition of the FFT. The smallest admissible n' >= n can be
1173   obtained via fft62_next_size().
1174 */
1175 
// NOTE: the twisting parameter is not implemented.
1177 // NOTE: the next admissible size function is called FFTRoundUp,
1178 //   and is defined in FFT.h.
1179 
1180 
1181 /*
1182   Truncated FFT interface is as follows:
1183 
1184   xn and yn must be admissible sizes for N.
1185 
1186   Input in xp[] is a_0, a_1, ..., a_{xn-1}. Assumes a_i = 0 for xn <= i < N.
1187 
1188   Output in yp[] is b_0, ..., b_{yn-1}, i.e. only first yn outputs are computed.
1189 
1190   Twisting parameter Z is described by z and lgH. If z == 0, then Z = basic
1191   2^lgH-th root of 1, and must have lgH >= lgN + 1. If z != 0, then Z = z
1192   (and lgH is ignored).
1193 
1194   The buffers {xp,xn} and {yp,yn} may overlap, but only if xp == yp.
1195 
1196   Inputs are in [0, 2p), outputs are in [0, 2p).
1197 
1198   threads = number of OpenMP threads to use.
1199 */
1200 
1201 
1202 
1203 /*
1204   Inverse truncated FFT interface is as follows.
1205 
1206   xn and yn must be admissible sizes for N, with yn <= xn.
1207 
1208   Input in xp[] is b_0, b_1, ..., b_{yn-1}, N*a_{yn}, ..., N*a_{xn-1}.
1209 
1210   Assumes a_i = 0 for xn <= i < N.
1211 
1212   Output in yp[] is N*a_0, ..., N*a_{yn-1}.
1213 
1214   Twisting parameter Z is described by z and lgH. If z == 0, then Z = basic
1215   2^lgH-th root of 1, and must have lgH >= lgN + 1. If z != 0, then Z = z^(-1)
1216   (and lgH is ignored).
1217 
1218   The buffers {xp,xn} and {yp,yn} may overlap, but only if xp == yp.
1219 
1220   Inputs are in [0, 4p), outputs are in [0, 4p).
1221 
1222   threads = number of OpenMP threads to use.
1223 
1224   (note: no function actually implements this interface in full generality!
1225   This is because it is tricky (and not that useful) to implement the twisting
1226   parameter when xn != yn.)
1227 */
1228 
1229 // NOTE: threads and twisting parameter are not used here.
1230 // NOTE: the code has been re-written and simplified so that
1231 //   everything is done in place, so xp == yp.
1232 
1233 
1234 
1235 
1236 //===================================================================
1237 
1238 
1239 
1240 
1241 
1242 
1243 // NOTE: these could be inlined, but I found the code generation
1244 // to be extremely sensitive to seemingly trivial changes,
1245 // so it seems safest to use macros instead.
1246 // w and wqinv are read only once.
1247 // q is read several times.
1248 // xx0, xx1 are read once and written once
1249 
1250 #define fwd_butterfly(xx0, xx1, w, q, wqinv)  \
1251 do \
1252 { \
1253    umint_t x0_ = xx0; \
1254    umint_t x1_ = xx1; \
1255    umint_t t_  = LazySubMod(x0_, x1_, q); \
1256    xx0 = LazyAddMod2(x0_, x1_, q); \
1257    xx1 = LazyMulModPrecon(t_, w, q, wqinv); \
1258 }  \
1259 while (0)
1260 
1261 #define fwd_butterfly_neg(xx0, xx1, w, q, wqinv)  \
1262 do \
1263 { \
1264    umint_t x0_ = xx0; \
1265    umint_t x1_ = xx1; \
1266    umint_t t_  = LazySubMod(x1_, x0_, q); /* NEG */ \
1267    xx0 = LazyAddMod2(x0_, x1_, q); \
1268    xx1 = LazyMulModPrecon(t_, w, q, wqinv); \
1269 }  \
1270 while (0)
1271 
1272 #define fwd_butterfly1(xx0, xx1, w, q, wqinv, w1, w1qinv)  \
1273 do \
1274 { \
1275    umint_t x0_ = xx0; \
1276    umint_t x1_ = xx1; \
1277    umint_t t_  = LazySubMod(x0_, x1_, q); \
1278    xx0 = LazyAddMod2(x0_, x1_, q); \
1279    xx1 = LazyMulModPrecon(LazyMulModPrecon(t_, w1, q, w1qinv), w, q, wqinv); \
1280 }  \
1281 while (0)
1282 
1283 
1284 #define fwd_butterfly0(xx0, xx1, q) \
1285 do   \
1286 {  \
1287    umint_t x0_ = xx0;  \
1288    umint_t x1_ = xx1;  \
1289    xx0 = LazyAddMod2(x0_, x1_, q);  \
1290    xx1 = LazySubMod2(x0_, x1_, q);  \
1291 }  \
1292 while (0)
1293 
1294 
1295 #define NTL_NEW_FFT_THRESH (11)
1296 
1297 struct new_mod_t {
1298    mint_t q;
1299    const mint_t **wtab;
1300    const mulmod_precon_t **wqinvtab;
1301 };
1302 
1303 
1304 
1305 
1306 
1307 // requires size divisible by 8
1308 static void
new_fft_layer(umint_t* xp, long blocks, long size,
1310               const mint_t* NTL_RESTRICT wtab,
1311               const mulmod_precon_t* NTL_RESTRICT wqinvtab,
1312               mint_t q)
1313 {
1314   size /= 2;
1315 
1316   do
1317     {
1318       umint_t* NTL_RESTRICT xp0 = xp;
1319       umint_t* NTL_RESTRICT xp1 = xp + size;
1320 
1321       // first 4 butterflies
1322       fwd_butterfly0(xp0[0+0], xp1[0+0], q);
1323       fwd_butterfly(xp0[0+1], xp1[0+1], wtab[0+1], q, wqinvtab[0+1]);
1324       fwd_butterfly(xp0[0+2], xp1[0+2], wtab[0+2], q, wqinvtab[0+2]);
1325       fwd_butterfly(xp0[0+3], xp1[0+3], wtab[0+3], q, wqinvtab[0+3]);
1326 
1327       // 4-way unroll
1328       for (long j = 4; j < size; j += 4) {
1329         fwd_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
1330         fwd_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
1331         fwd_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
1332         fwd_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
1333       }
1334 
1335       xp += 2 * size;
1336     }
1337   while (--blocks != 0);
1338 }
1339 
1340 
1341 static void
new_fft_last_two_layers(umint_t* xp, long blocks,
1343 			  const mint_t* wtab, const mulmod_precon_t* wqinvtab,
1344                           mint_t q)
1345 {
1346   // 4th root of unity
1347   mint_t w = wtab[1];
1348   mulmod_precon_t wqinv = wqinvtab[1];
1349 
1350   do
1351     {
1352       umint_t u0 = xp[0];
1353       umint_t u1 = xp[1];
1354       umint_t u2 = xp[2];
1355       umint_t u3 = xp[3];
1356 
1357       umint_t v0 = LazyAddMod2(u0, u2, q);
1358       umint_t v2 = LazySubMod2(u0, u2, q);
1359       umint_t v1 = LazyAddMod2(u1, u3, q);
1360       umint_t t  = LazySubMod(u1, u3, q);
1361       umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1362 
1363       xp[0] = LazyAddMod2(v0, v1, q);
1364       xp[1] = LazySubMod2(v0, v1, q);
1365       xp[2] = LazyAddMod2(v2, v3, q);
1366       xp[3] = LazySubMod2(v2, v3, q);
1367 
1368       xp += 4;
1369     }
1370   while (--blocks != 0);
1371 }
1372 
1373 
1374 
void new_fft_base(umint_t* xp, long lgN, const new_mod_t& mod)
1376 {
1377   if (lgN == 0) return;
1378 
1379   mint_t q = mod.q;
1380 
1381   if (lgN == 1)
1382     {
1383       umint_t x0 = xp[0];
1384       umint_t x1 = xp[1];
1385       xp[0] = LazyAddMod2(x0, x1, q);
1386       xp[1] = LazySubMod2(x0, x1, q);
1387       return;
1388     }
1389 
1390   const mint_t** wtab = mod.wtab;
1391   const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1392 
1393   long N = 1L << lgN;
1394 
1395   for (long j = lgN, size = N, blocks = 1;
1396        j > 2; j--, blocks <<= 1, size >>= 1)
1397     new_fft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
1398 
1399   new_fft_last_two_layers(xp, N/4, wtab[2], wqinvtab[2], q);
1400 }
1401 
1402 
// Implements the truncated FFT interface, described above.
// All computations are done in place, and xp should point to
// an array of size N, all of which may be overwritten
// during the computation.
1407 static
void new_fft_short(umint_t* xp, long yn, long xn, long lgN,
1409                    const new_mod_t& mod)
1410 {
1411   long N = 1L << lgN;
1412 
1413   if (yn == N)
1414     {
1415       if (xn == N && lgN <= NTL_NEW_FFT_THRESH)
1416 	{
1417 	  // no truncation
1418 	  new_fft_base(xp, lgN, mod);
1419 	  return;
1420 	}
1421     }
1422 
1423   // divide-and-conquer algorithm
1424 
1425   long half = N >> 1;
1426   mint_t q = mod.q;
1427 
1428   if (yn <= half)
1429     {
1430       if (xn <= half)
1431 	{
1432 	  new_fft_short(xp, yn, xn, lgN - 1, mod);
1433 	}
1434       else
1435 	{
1436 	  xn -= half;
1437 
1438 	  // (X, Y) -> X + Y
1439 	  for (long j = 0; j < xn; j++)
1440 	    xp[j] = LazyAddMod2(xp[j], xp[j + half], q);
1441 
1442 	  new_fft_short(xp, yn, half, lgN - 1, mod);
1443 	}
1444     }
1445   else
1446     {
1447       yn -= half;
1448 
1449       umint_t* NTL_RESTRICT xp0 = xp;
1450       umint_t* NTL_RESTRICT xp1 = xp + half;
1451       const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
1452       const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
1453 
1454       if (xn <= half)
1455 	{
1456 	  // X -> (X, w*X)
1457 	  for (long j = 0; j < xn; j++)
1458 	    xp1[j] = LazyMulModPrecon(xp0[j], wtab[j], q, wqinvtab[j]);
1459 
1460 	  new_fft_short(xp0, half, xn, lgN - 1, mod);
1461 	  new_fft_short(xp1, yn, xn, lgN - 1, mod);
1462 	}
1463       else
1464 	{
1465 	  xn -= half;
1466 
1467 	  // (X, Y) -> (X + Y, w*(X - Y))
1468           // DIRT: assumes xn is a multiple of 4
1469           fwd_butterfly0(xp0[0], xp1[0], q);
1470           fwd_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
1471           fwd_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
1472           fwd_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
1473 	  for (long j = 4; j < xn; j+=4) {
1474             fwd_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
1475             fwd_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
1476             fwd_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
1477             fwd_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
1478           }
1479 
1480 	  // X -> (X, w*X)
1481 	  for (long j = xn; j < half; j++)
1482 	    xp1[j] = LazyMulModPrecon(xp0[j], wtab[j], q, wqinvtab[j]);
1483 
1484 	  new_fft_short(xp0, half, half, lgN - 1, mod);
1485 	  new_fft_short(xp1, yn, half, lgN - 1, mod);
1486 	}
1487     }
1488 }
1489 
1490 static
void new_fft_short_notab(umint_t* xp, long yn, long xn, long lgN,
1492                    const new_mod_t& mod, mint_t w, mulmod_precon_t wqinv)
1493 // This version assumes that we only have tables up to level lgN-1,
1494 // and w generates the values at level lgN.
1495 // DIRT: requires xn even
1496 {
1497   long N = 1L << lgN;
1498 
1499   // divide-and-conquer algorithm
1500 
1501   long half = N >> 1;
1502   mint_t q = mod.q;
1503 
1504   if (yn <= half)
1505     {
1506       if (xn <= half)
1507 	{
1508 	  new_fft_short(xp, yn, xn, lgN - 1, mod);
1509 	}
1510       else
1511 	{
1512 	  xn -= half;
1513 
1514 	  // (X, Y) -> X + Y
1515 	  for (long j = 0; j < xn; j++)
1516 	    xp[j] = LazyAddMod2(xp[j], xp[j + half], q);
1517 
1518 	  new_fft_short(xp, yn, half, lgN - 1, mod);
1519 	}
1520     }
1521   else
1522     {
1523       yn -= half;
1524 
1525       umint_t* NTL_RESTRICT xp0 = xp;
1526       umint_t* NTL_RESTRICT xp1 = xp + half;
1527       const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN-1];
1528       const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN-1];
1529 
1530       if (xn <= half)
1531 	{
1532 	  // X -> (X, w*X)
1533 	  for (long j = 0, j_half = 0; j < xn; j+=2, j_half++) {
1534 	    xp1[j] = LazyMulModPrecon(xp0[j], wtab[j_half], q, wqinvtab[j_half]);
1535 	    xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(xp0[j+1], w, q, wqinv),
1536                                         wtab[j_half], q, wqinvtab[j_half]);
1537           }
1538 
1539 	  new_fft_short(xp0, half, xn, lgN - 1, mod);
1540 	  new_fft_short(xp1, yn, xn, lgN - 1, mod);
1541 	}
1542       else
1543 	{
1544 	  xn -= half;
1545 
1546 	  // (X, Y) -> (X + Y, w*(X - Y))
1547           fwd_butterfly0(xp0[0], xp1[0], q);
1548           fwd_butterfly(xp0[1], xp1[1], w, q, wqinv);
1549           long j = 2;
1550           long j_half = 1;
1551 	  for (; j < xn; j+=2, j_half++) {
1552             fwd_butterfly(xp0[j], xp1[j], wtab[j_half], q, wqinvtab[j_half]);
1553             fwd_butterfly1(xp0[j+1], xp1[j+1], wtab[j_half], q, wqinvtab[j_half], w, wqinv);
1554           }
1555 
1556 	  // X -> (X, w*X)
1557 	  for (; j < half; j+=2, j_half++) {
1558 	    xp1[j] = LazyMulModPrecon(xp0[j], wtab[j_half], q, wqinvtab[j_half]);
1559 	    xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(xp0[j+1], w, q, wqinv),
1560                                         wtab[j_half], q, wqinvtab[j_half]);
1561           }
1562 
1563 	  new_fft_short(xp0, half, half, lgN - 1, mod);
1564 	  new_fft_short(xp1, yn, half, lgN - 1, mod);
1565 	}
1566     }
1567 }
1568 
1569 
1570 //=====
1571 
1572 
// NOTE: these "flipped" routines perform the same
// functions as their normal, "unflipped" counterparts,
// except that they work with inverted roots.
// They also perform no truncation, just to keep things simple.
// All of this is necessary only to implement the UpdateMap
// routines for ZZ_pX and zz_pX.
1579 
1580 // requires size divisible by 8
1581 static void
new_fft_layer_flipped(umint_t* xp, long blocks, long size,
1583               const mint_t* wtab,
1584               const mulmod_precon_t* wqinvtab,
1585               mint_t q)
1586 {
1587   size /= 2;
1588 
1589   const mint_t* NTL_RESTRICT wtab1 = wtab + size;
1590   const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + size;
1591 
1592   do
1593     {
1594       umint_t* NTL_RESTRICT xp0 = xp;
1595       umint_t* NTL_RESTRICT xp1 = xp + size;
1596 
1597       // first 4 butterflies
1598       fwd_butterfly0(xp0[0+0], xp1[0+0], q);
1599       fwd_butterfly_neg(xp0[0+1], xp1[0+1], wtab1[-(0+1)], q, wqinvtab1[-(0+1)]);
1600       fwd_butterfly_neg(xp0[0+2], xp1[0+2], wtab1[-(0+2)], q, wqinvtab1[-(0+2)]);
1601       fwd_butterfly_neg(xp0[0+3], xp1[0+3], wtab1[-(0+3)], q, wqinvtab1[-(0+3)]);
1602 
1603       // 4-way unroll
1604       for (long j = 4; j < size; j += 4) {
1605         fwd_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1606         fwd_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1607         fwd_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1608         fwd_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1609       }
1610 
1611       xp += 2 * size;
1612     }
1613   while (--blocks != 0);
1614 }
1615 
1616 
1617 
1618 static void
new_fft_last_two_layers_flipped(umint_t* xp, long blocks,
1620 			  const mint_t* wtab, const mulmod_precon_t* wqinvtab,
1621                           mint_t q)
1622 {
1623   // 4th root of unity
1624   mint_t w = wtab[1];
1625   mulmod_precon_t wqinv = wqinvtab[1];
1626 
1627   do
1628     {
1629       umint_t u0 = xp[0];
1630       umint_t u1 = xp[1];
1631       umint_t u2 = xp[2];
1632       umint_t u3 = xp[3];
1633 
1634       umint_t v0 = LazyAddMod2(u0, u2, q);
1635       umint_t v2 = LazySubMod2(u0, u2, q);
1636       umint_t v1 = LazyAddMod2(u1, u3, q);
1637       umint_t t  = LazySubMod(u3, u1, q); // NEG
1638       umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1639 
1640       xp[0] = LazyAddMod2(v0, v1, q);
1641       xp[1] = LazySubMod2(v0, v1, q);
1642       xp[2] = LazyAddMod2(v2, v3, q);
1643       xp[3] = LazySubMod2(v2, v3, q);
1644 
1645       xp += 4;
1646     }
1647   while (--blocks != 0);
1648 }
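// In exact (fully reduced) arithmetic, the routine above maps each block
// (u0, u1, u2, u3) to
//
//    (u0+u2+u1+u3,  u0+u2-u1-u3,  u0-u2+w*(u3-u1),  u0-u2-w*(u3-u1)),
//
// where w = wtab[1] is the 4th root of unity loaded above; since w^2 == -1
// (mod q), the negated difference u3-u1 makes the effective twiddle
// -w = w^{-1}.  This folds the last two levels of the flipped transform
// into a single pass over the data.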
1649 
1650 
1651 
1652 void new_fft_base_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
1653 {
1654   if (lgN == 0) return;
1655 
1656   mint_t q = mod.q;
1657 
1658   if (lgN == 1)
1659     {
1660       umint_t x0 = xp[0];
1661       umint_t x1 = xp[1];
1662       xp[0] = LazyAddMod2(x0, x1, q);
1663       xp[1] = LazySubMod2(x0, x1, q);
1664       return;
1665     }
1666 
1667   const mint_t** wtab = mod.wtab;
1668   const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1669 
1670   long N = 1L << lgN;
1671 
1672   for (long j = lgN, size = N, blocks = 1;
1673        j > 2; j--, blocks <<= 1, size >>= 1)
1674     new_fft_layer_flipped(xp, blocks, size, wtab[j], wqinvtab[j], q);
1675 
1676   new_fft_last_two_layers_flipped(xp, N/4, wtab[2], wqinvtab[2], q);
1677 }
1678 
1679 
1680 static
1681 void new_fft_short_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
1682 {
1683   long N = 1L << lgN;
1684 
1685   if (lgN <= NTL_NEW_FFT_THRESH)
1686     {
1687       new_fft_base_flipped(xp, lgN, mod);
1688       return;
1689     }
1690 
1691   // divide-and-conquer algorithm
1692 
1693   long half = N >> 1;
1694   mint_t q = mod.q;
1695 
1696   umint_t* NTL_RESTRICT xp0 = xp;
1697   umint_t* NTL_RESTRICT xp1 = xp + half;
1698   const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN] + half;
1699   const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN] + half;
1700 
1701   // (X, Y) -> (X + Y, w*(X - Y))
1702 
1703   fwd_butterfly0(xp0[0], xp1[0], q);
1704   fwd_butterfly_neg(xp0[1], xp1[1], wtab[-1], q, wqinvtab[-1]);
1705   fwd_butterfly_neg(xp0[2], xp1[2], wtab[-2], q, wqinvtab[-2]);
1706   fwd_butterfly_neg(xp0[3], xp1[3], wtab[-3], q, wqinvtab[-3]);
1707   for (long j = 4; j < half; j+=4) {
1708     fwd_butterfly_neg(xp0[j+0], xp1[j+0], wtab[-(j+0)], q, wqinvtab[-(j+0)]);
1709     fwd_butterfly_neg(xp0[j+1], xp1[j+1], wtab[-(j+1)], q, wqinvtab[-(j+1)]);
1710     fwd_butterfly_neg(xp0[j+2], xp1[j+2], wtab[-(j+2)], q, wqinvtab[-(j+2)]);
1711     fwd_butterfly_neg(xp0[j+3], xp1[j+3], wtab[-(j+3)], q, wqinvtab[-(j+3)]);
1712   }
1713 
1714   new_fft_short_flipped(xp0, lgN - 1, mod);
1715   new_fft_short_flipped(xp1, lgN - 1, mod);
1716 }
1717 
1718 
1719 
1720 // IFFT (inverse truncated FFT)
1721 
1722 
1723 #define inv_butterfly0(xx0, xx1, q)  \
1724 do   \
1725 {  \
1726    umint_t x0_ = LazyReduce2(xx0, q);  \
1727    umint_t x1_ = LazyReduce2(xx1, q);  \
1728    xx0 = LazyAddMod(x0_, x1_, q);  \
1729    xx1 = LazySubMod(x0_, x1_, q);  \
1730 } while (0)
1731 
1732 
1733 #define inv_butterfly_neg(xx0, xx1, w, q, wqinv)  \
1734 do  \
1735 {  \
1736    umint_t x0_ = LazyReduce2(xx0, q);  \
1737    umint_t x1_ = xx1;  \
1738    umint_t t_ = LazyMulModPrecon(x1_, w, q, wqinv);   \
1739    xx0 = LazySubMod(x0_, t_, q);  /* NEG */   \
1740    xx1 = LazyAddMod(x0_, t_, q);  /* NEG */   \
1741 } while (0)
1742 
1743 #define inv_butterfly(xx0, xx1, w, q, wqinv)  \
1744 do  \
1745 {  \
1746    umint_t x0_ = LazyReduce2(xx0, q);  \
1747    umint_t x1_ = xx1;  \
1748    umint_t t_ = LazyMulModPrecon(x1_, w, q, wqinv);   \
1749    xx0 = LazyAddMod(x0_, t_, q);    \
1750    xx1 = LazySubMod(x0_, t_, q);    \
1751 } while (0)
1752 
1753 #define inv_butterfly1_neg(xx0, xx1, w, q, wqinv, w1, w1qinv)  \
1754 do  \
1755 {  \
1756    umint_t x0_ = LazyReduce2(xx0, q);  \
1757    umint_t x1_ = xx1;  \
1758    umint_t t_ = LazyMulModPrecon(LazyMulModPrecon(x1_, w1, q, w1qinv), w, q, wqinv);   \
1759    xx0 = LazySubMod(x0_, t_, q);  /* NEG */   \
1760    xx1 = LazyAddMod(x0_, t_, q);  /* NEG */   \
1761 } while (0)
1762 
1763 
1764 static
1765 void new_ifft_short2(umint_t* yp, long yn, long lgN, const new_mod_t& mod);
1766 
1767 
1768 
1769 // requires size divisible by 8
1770 static void
1771 new_ifft_layer(umint_t* xp, long blocks, long size,
1772 		 const mint_t* wtab,
1773                  const mulmod_precon_t* wqinvtab, mint_t q)
1774 {
1775 
1776   size /= 2;
1777   const mint_t* NTL_RESTRICT wtab1 = wtab + size;
1778   const mulmod_precon_t* NTL_RESTRICT wqinvtab1 = wqinvtab + size;
1779 
1780   do
1781     {
1782 
1783       umint_t* NTL_RESTRICT xp0 = xp;
1784       umint_t* NTL_RESTRICT xp1 = xp + size;
1785 
1786 
1787       // first 4 butterflies
1788       inv_butterfly0(xp0[0], xp1[0], q);
1789       inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
1790       inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
1791       inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
1792 
1793       // 4-way unroll
1794       for (long j = 4; j < size; j+= 4) {
1795 	 inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1796 	 inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1797 	 inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1798 	 inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1799       }
1800 
1801       xp += 2 * size;
1802     }
1803   while (--blocks != 0);
1804 }
1805 
1806 
1807 static void
1808 new_ifft_first_two_layers(umint_t* xp, long blocks, const mint_t* wtab,
1809                           const mulmod_precon_t* wqinvtab, mint_t q)
1810 {
1811   // 4th root of unity
1812   mint_t w = wtab[1];
1813   mulmod_precon_t wqinv = wqinvtab[1];
1814 
1815   do
1816     {
1817       umint_t u0 = LazyReduce2(xp[0], q);
1818       umint_t u1 = LazyReduce2(xp[1], q);
1819       umint_t u2 = LazyReduce2(xp[2], q);
1820       umint_t u3 = LazyReduce2(xp[3], q);
1821 
1822       umint_t v0 = LazyAddMod2(u0, u1, q);
1823       umint_t v1 = LazySubMod2(u0, u1, q);
1824       umint_t v2 = LazyAddMod2(u2, u3, q);
1825       umint_t t  = LazySubMod(u2, u3, q);
1826       umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
1827 
1828       xp[0] = LazyAddMod(v0, v2, q);
1829       xp[2] = LazySubMod(v0, v2, q);
1830       xp[1] = LazySubMod(v1, v3, q);  // NEG
1831       xp[3] = LazyAddMod(v1, v3, q);  // NEG
1832 
1833       xp += 4;
1834     }
1835   while (--blocks != 0);
1836 }
1837 
1838 
1839 
1840 static
1841 void new_ifft_base(umint_t* xp, long lgN, const new_mod_t& mod)
1842 {
1843   if (lgN == 0) return;
1844 
1845   mint_t q = mod.q;
1846 
1847   if (lgN == 1)
1848     {
1849       umint_t x0 = LazyReduce2(xp[0], q);
1850       umint_t x1 = LazyReduce2(xp[1], q);
1851       xp[0] = LazyAddMod(x0, x1, q);
1852       xp[1] = LazySubMod(x0, x1, q);
1853       return;
1854     }
1855 
1856   const mint_t** wtab = mod.wtab;
1857   const mulmod_precon_t** wqinvtab = mod.wqinvtab;
1858 
1859   long blocks = 1L << (lgN - 2);
1860   new_ifft_first_two_layers(xp, blocks, wtab[2], wqinvtab[2], q);
1861   blocks >>= 1;
1862 
1863   long size = 8;
1864   for (long j = 3; j <= lgN; j++, blocks >>= 1, size <<= 1)
1865     new_ifft_layer(xp, blocks, size, wtab[j], wqinvtab[j], q);
1866 }
1867 
1868 
1869 static
1870 void new_ifft_short1(umint_t* xp, long yn, long lgN, const new_mod_t& mod)
1871 
1872 // Implements truncated inverse FFT interface, but with xn==yn.
1873 // All computations are done in place.
1874 
1875 {
1876   long N = 1L << lgN;
1877 
1878   if (yn == N && lgN <= NTL_NEW_FFT_THRESH)
1879     {
1880       // no truncation
1881       new_ifft_base(xp, lgN, mod);
1882       return;
1883     }
1884 
1885   // divide-and-conquer algorithm
1886 
1887   long half = N >> 1;
1888   mint_t q = mod.q;
1889 
1890   if (yn <= half)
1891     {
1892       // X -> 2X
1893       for (long j = 0; j < yn; j++)
1894       	xp[j] = LazyDoubleMod4(xp[j], q);
1895 
1896       new_ifft_short1(xp, yn, lgN - 1, mod);
1897     }
1898   else
1899     {
1900       umint_t* NTL_RESTRICT xp0 = xp;
1901       umint_t* NTL_RESTRICT xp1 = xp + half;
1902       const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
1903       const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
1904 
1905       new_ifft_short1(xp0, half, lgN - 1, mod);
1906 
1907       yn -= half;
1908 
1909       // X -> (2X, w*X)
1910       for (long j = yn; j < half; j++)
1911 	{
1912 	  umint_t x0 = xp0[j];
1913 	  xp0[j] = LazyDoubleMod4(x0, q);
1914 	  xp1[j] = LazyMulModPrecon(x0, wtab[j], q, wqinvtab[j]);
1915 	}
1916 
1917       new_ifft_short2(xp1, yn, lgN - 1, mod);
1918 
1919       // (X, Y) -> (X + Y/w, X - Y/w)
1920       {
1921 	const mint_t* NTL_RESTRICT wtab1 = wtab + half;
1922 	const mulmod_precon_t* NTL_RESTRICT wqinvtab1 =  wqinvtab + half;
1923 
1924 	// DIRT: assumes yn is a multiple of 4
1925 	inv_butterfly0(xp0[0], xp1[0], q);
1926 	inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
1927 	inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
1928 	inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
1929 	for (long j = 4; j < yn; j+=4) {
1930 	  inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
1931 	  inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
1932 	  inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
1933 	  inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
1934 	}
1935       }
1936     }
1937 }
1938 
1939 
1940 static
1941 void new_ifft_short1_notab(umint_t* xp, long yn, long lgN, const new_mod_t& mod,
1942                            mint_t w, mulmod_precon_t wqinv,
1943                            mint_t iw, mulmod_precon_t iwqinv)
1944 // This version assumes that we only have tables up to level lgN-1,
1945 // and w generates the values at level lgN.
1946 // DIRT: requires yn even
1947 {
1948   long N = 1L << lgN;
1949 
1950   // divide-and-conquer algorithm
1951 
1952   long half = N >> 1;
1953   mint_t q = mod.q;
1954 
1955   if (yn <= half)
1956     {
1957       // X -> 2X
1958       for (long j = 0; j < yn; j++)
1959       	xp[j] = LazyDoubleMod4(xp[j], q);
1960 
1961       new_ifft_short1(xp, yn, lgN - 1, mod);
1962     }
1963   else
1964     {
1965       umint_t* NTL_RESTRICT xp0 = xp;
1966       umint_t* NTL_RESTRICT xp1 = xp + half;
1967       const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN-1];
1968       const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN-1];
1969 
1970       new_ifft_short1(xp0, half, lgN - 1, mod);
1971 
1972       yn -= half;
1973 
1974       // X -> (2X, w*X)
1975       for (long j = yn, j_half = yn/2; j < half; j+=2, j_half++) {
1976 	{
1977 	  umint_t x0 = xp0[j+0];
1978 	  xp0[j+0] = LazyDoubleMod4(x0, q);
1979 	  xp1[j+0] = LazyMulModPrecon(x0, wtab[j_half], q, wqinvtab[j_half]);
1980 	}
1981 	{
1982 	  umint_t x0 = xp0[j+1];
1983 	  xp0[j+1] = LazyDoubleMod4(x0, q);
1984 	  xp1[j+1] = LazyMulModPrecon(LazyMulModPrecon(x0, w, q, wqinv),
1985                                       wtab[j_half], q, wqinvtab[j_half]);
1986 	}
1987       }
1988 
1989       new_ifft_short2(xp1, yn, lgN - 1, mod);
1990 
1991       // (X, Y) -> (X + Y/w, X - Y/w)
1992       {
1993 	const mint_t* NTL_RESTRICT wtab1 = wtab + half/2;
1994 	const mulmod_precon_t* NTL_RESTRICT wqinvtab1 =  wqinvtab + half/2;
1995 
1996 	inv_butterfly0(xp0[0], xp1[0], q);
1997 	inv_butterfly(xp0[1], xp1[1], iw, q, iwqinv);
1998 	for (long j = 2, j_half = 1; j < yn; j+=2, j_half++) {
1999 	  inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-j_half], q, wqinvtab1[-j_half]);
2000 	  inv_butterfly1_neg(xp0[j+1], xp1[j+1], wtab1[-j_half], q, wqinvtab1[-j_half], iw, iwqinv);
2001 	}
2002       }
2003     }
2004 }
2005 
2006 
2007 
2008 //=========
2009 
2010 
2011 // requires size divisible by 8
2012 static void
2013 new_ifft_layer_flipped(umint_t* xp, long blocks, long size,
2014 		 const mint_t* NTL_RESTRICT wtab,
2015                  const mulmod_precon_t* NTL_RESTRICT wqinvtab, mint_t q)
2016 {
2017 
2018   size /= 2;
2019 
2020   do
2021     {
2022 
2023       umint_t* NTL_RESTRICT xp0 = xp;
2024       umint_t* NTL_RESTRICT xp1 = xp + size;
2025 
2026 
2027       // first 4 butterflies
2028       inv_butterfly0(xp0[0], xp1[0], q);
2029       inv_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
2030       inv_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
2031       inv_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
2032 
2033       // 4-way unroll
2034       for (long j = 4; j < size; j+= 4) {
2035 	 inv_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
2036 	 inv_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
2037 	 inv_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
2038 	 inv_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
2039       }
2040 
2041       xp += 2 * size;
2042     }
2043   while (--blocks != 0);
2044 }
2045 
2046 
2047 static void
2048 new_ifft_first_two_layers_flipped(umint_t* xp, long blocks, const mint_t* wtab,
2049                           const mulmod_precon_t* wqinvtab, mint_t q)
2050 {
2051   // 4th root of unity
2052   mint_t w = wtab[1];
2053   mulmod_precon_t wqinv = wqinvtab[1];
2054 
2055   do
2056     {
2057       umint_t u0 = LazyReduce2(xp[0], q);
2058       umint_t u1 = LazyReduce2(xp[1], q);
2059       umint_t u2 = LazyReduce2(xp[2], q);
2060       umint_t u3 = LazyReduce2(xp[3], q);
2061 
2062       umint_t v0 = LazyAddMod2(u0, u1, q);
2063       umint_t v1 = LazySubMod2(u0, u1, q);
2064       umint_t v2 = LazyAddMod2(u2, u3, q);
2065       umint_t t  = LazySubMod(u2, u3, q);
2066       umint_t v3 = LazyMulModPrecon(t, w, q, wqinv);
2067 
2068       xp[0] = LazyAddMod(v0, v2, q);
2069       xp[2] = LazySubMod(v0, v2, q);
2070       xp[1] = LazyAddMod(v1, v3, q);
2071       xp[3] = LazySubMod(v1, v3, q);
2072 
2073       xp += 4;
2074     }
2075   while (--blocks != 0);
2076 }
2077 
2078 
2079 
2080 static
2081 void new_ifft_base_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
2082 {
2083   if (lgN == 0) return;
2084 
2085   mint_t q = mod.q;
2086 
2087   if (lgN == 1)
2088     {
2089       umint_t x0 = LazyReduce2(xp[0], q);
2090       umint_t x1 = LazyReduce2(xp[1], q);
2091       xp[0] = LazyAddMod(x0, x1, q);
2092       xp[1] = LazySubMod(x0, x1, q);
2093       return;
2094     }
2095 
2096   const mint_t** wtab = mod.wtab;
2097   const mulmod_precon_t** wqinvtab = mod.wqinvtab;
2098 
2099   long blocks = 1L << (lgN - 2);
2100   new_ifft_first_two_layers_flipped(xp, blocks, wtab[2], wqinvtab[2], q);
2101   blocks >>= 1;
2102 
2103   long size = 8;
2104   for (long j = 3; j <= lgN; j++, blocks >>= 1, size <<= 1)
2105     new_ifft_layer_flipped(xp, blocks, size, wtab[j], wqinvtab[j], q);
2106 }
2107 
2108 
2109 static
2110 void new_ifft_short1_flipped(umint_t* xp, long lgN, const new_mod_t& mod)
2111 {
2112   long N = 1L << lgN;
2113 
2114   if (lgN <= NTL_NEW_FFT_THRESH)
2115     {
2116       new_ifft_base_flipped(xp, lgN, mod);
2117       return;
2118     }
2119 
2120   // divide-and-conquer algorithm
2121 
2122   long half = N >> 1;
2123   mint_t q = mod.q;
2124 
2125   umint_t* NTL_RESTRICT xp0 = xp;
2126   umint_t* NTL_RESTRICT xp1 = xp + half;
2127   const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
2128   const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
2129 
2130   new_ifft_short1_flipped(xp0, lgN - 1, mod);
2131   new_ifft_short1_flipped(xp1, lgN - 1, mod);
2132 
2133   // (X, Y) -> (X + Y*w, X - Y*w)
2134 
2135   inv_butterfly0(xp0[0], xp1[0], q);
2136   inv_butterfly(xp0[1], xp1[1], wtab[1], q, wqinvtab[1]);
2137   inv_butterfly(xp0[2], xp1[2], wtab[2], q, wqinvtab[2]);
2138   inv_butterfly(xp0[3], xp1[3], wtab[3], q, wqinvtab[3]);
2139   for (long j = 4; j < half; j+=4) {
2140     inv_butterfly(xp0[j+0], xp1[j+0], wtab[j+0], q, wqinvtab[j+0]);
2141     inv_butterfly(xp0[j+1], xp1[j+1], wtab[j+1], q, wqinvtab[j+1]);
2142     inv_butterfly(xp0[j+2], xp1[j+2], wtab[j+2], q, wqinvtab[j+2]);
2143     inv_butterfly(xp0[j+3], xp1[j+3], wtab[j+3], q, wqinvtab[j+3]);
2144   }
2145 }
2146 
2147 //=========
2148 
2149 
2150 
2151 static
2152 void new_ifft_short2(umint_t* xp, long yn, long lgN, const new_mod_t& mod)
2153 
2154 // Implements truncated inverse FFT interface, but with xn==N.
2155 // All computations are done in place.
2156 
2157 {
2158   long N = 1L << lgN;
2159 
2160   if (yn == N && lgN <= NTL_NEW_FFT_THRESH)
2161     {
2162       // no truncation
2163       new_ifft_base(xp, lgN, mod);
2164       return;
2165     }
2166 
2167   // divide-and-conquer algorithm
2168 
2169   long half = N >> 1;
2170   mint_t q = mod.q;
2171 
2172   if (yn <= half)
2173     {
2174       // X -> 2X
2175       for (long j = 0; j < yn; j++)
2176      	xp[j] = LazyDoubleMod4(xp[j], q);
2177       // (X, Y) -> X + Y
2178       for (long j = yn; j < half; j++)
2179 	xp[j] = LazyAddMod4(xp[j], xp[j + half], q);
2180 
2181       new_ifft_short2(xp, yn, lgN - 1, mod);
2182 
2183       // (X, Y) -> X - Y
2184       for (long j = 0; j < yn; j++)
2185 	xp[j] = LazySubMod4(xp[j], xp[j + half], q);
2186     }
2187   else
2188     {
2189       umint_t* NTL_RESTRICT xp0 = xp;
2190       umint_t* NTL_RESTRICT xp1 = xp + half;
2191       const mint_t* NTL_RESTRICT wtab = mod.wtab[lgN];
2192       const mulmod_precon_t* NTL_RESTRICT wqinvtab = mod.wqinvtab[lgN];
2193 
2194       new_ifft_short1(xp0, half, lgN - 1, mod);
2195 
2196       yn -= half;
2197 
2198 
2199       // (X, Y) -> (2X - Y, w*(X - Y))
2200       for (long j = yn; j < half; j++)
2201 	{
2202 	  umint_t x0 = xp0[j];
2203 	  umint_t x1 = xp1[j];
2204 	  umint_t u = LazySubMod4(x0, x1, q);
2205 	  xp0[j] = LazyAddMod4(x0, u, q);
2206 	  xp1[j] = LazyMulModPrecon(u, wtab[j], q, wqinvtab[j]);
2207 	}
2208 
2209       new_ifft_short2(xp1, yn, lgN - 1, mod);
2210 
2211       // (X, Y) -> (X + Y/w, X - Y/w)
2212       {
2213 	const mint_t* NTL_RESTRICT wtab1 = wtab + half;
2214 	const mulmod_precon_t* NTL_RESTRICT wqinvtab1 =  wqinvtab + half;
2215 
2216 	// DIRT: assumes yn is a multiple of 4
2217 	inv_butterfly0(xp0[0], xp1[0], q);
2218 	inv_butterfly_neg(xp0[1], xp1[1], wtab1[-1], q, wqinvtab1[-1]);
2219 	inv_butterfly_neg(xp0[2], xp1[2], wtab1[-2], q, wqinvtab1[-2]);
2220 	inv_butterfly_neg(xp0[3], xp1[3], wtab1[-3], q, wqinvtab1[-3]);
2221 	for (long j = 4; j < yn; j+=4) {
2222 	  inv_butterfly_neg(xp0[j+0], xp1[j+0], wtab1[-(j+0)], q, wqinvtab1[-(j+0)]);
2223 	  inv_butterfly_neg(xp0[j+1], xp1[j+1], wtab1[-(j+1)], q, wqinvtab1[-(j+1)]);
2224 	  inv_butterfly_neg(xp0[j+2], xp1[j+2], wtab1[-(j+2)], q, wqinvtab1[-(j+2)]);
2225 	  inv_butterfly_neg(xp0[j+3], xp1[j+3], wtab1[-(j+3)], q, wqinvtab1[-(j+3)]);
2226 	}
2227       }
2228     }
2229 }
2230 
2231 
2232 //=============================================
2233 
2234 // HIGH LEVEL ROUTINES
2235 
2236 //=========== FFT without tables ===========
2237 
2238 
2239 NTL_TLS_GLOBAL_DECL(Vec<umint_t>, AA_store)
2240 
2241 NTL_TLS_GLOBAL_DECL(Vec<FFTVectorPair>, mul_vec)
2242 
2243 void new_fft_notab(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2244              long yn, long xn)
2245 
2246 // Performs a high-level FFT.  Inputs and outputs are in the range [0,q).
2247 // xn and yn are as described above in the truncated FFT interface.
2248 // Both A and a should point to arrays of size 2^k,
2249 // and should either be the same or not overlap at all.
2250 // This version does not use precomputed tables.
2251 
2252 {
2253    mint_t q = info.q;
2254 
2255    if (k <= 1) {
2256       if (k == 0) {
2257 	 A[0] = a[0];
2258 	 return;
2259       }
2260       if (k == 1) {
2261          mint_t A0 = AddMod(a[0], a[1], q);
2262          mint_t A1 = SubMod(a[0], a[1], q);
2263          A[0] = A0;
2264          A[1] = A1;
2265 	 return;
2266       }
2267    }
2268 
2269    // assume k > 1
2270    const mint_t *root = info.RootTable[0].elts();
2271    mulmod_t qinv = info.qinv;
2272 
2273    NTL_TLS_GLOBAL_ACCESS(mul_vec);
2274    ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2275 
2276    long n = 1L << k;
2277 
2278    const mint_t *wtab[NTL_FFTMaxRoot+1];
2279    for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2280 
2281    const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2282    for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2283 
2284    new_mod_t mod;
2285    mod.q = q;
2286    mod.wtab = &wtab[0];
2287    mod.wqinvtab = &wqinvtab[0];
2288 
2289    mint_t w = info.RootTable[0][k];
2290    mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2291 
2292 #ifdef NTL_FFT_USEBUF
2293    NTL_TLS_GLOBAL_ACCESS(AA_store);
2294    AA_store.SetLength(1L << k);
2295    umint_t *AA = AA_store.elts();
2296 
2297    for (long i = 0; i < xn; i++) AA[i] = a[i];
2298 
2299    new_fft_short_notab(AA, yn, xn, k, mod, w, wqinv);
2300 
2301    for (long i = 0; i < yn; i++) {
2302       A[i] = LazyReduce1(AA[i], q);
2303    }
2304 #else
2305    umint_t *AA = (umint_t *) A;
2306    if (a != A) for (long i = 0; i < xn; i++) AA[i] = a[i];
2307 
2308    new_fft_short_notab(AA, yn, xn, k, mod, w, wqinv);
2309 
2310    for (long i = 0; i < yn; i++) {
2311       AA[i] = LazyReduce1(AA[i], q);
2312    }
2313 #endif
2314 }
2315 
2316 
2317 void new_fft_flipped_notab(mint_t* A, const mint_t* a, long k,
2318              const FFTPrimeInfo& info)
2319 
2320 // Performs a high-level FFT.  Inputs and outputs are in the range [0,q).
2321 // Both A and a should point to arrays of size 2^k,
2322 // and should either be the same or not overlap at all.
2323 // This version is "flipped" -- it uses inverted roots,
2324 // multiplies by 2^{-k}, and performs no truncations.
2325 // This version does not use precomputed tables.
2326 
2327 {
2328    mint_t q = info.q;
2329 
2330    if (k <= 1) {
2331       if (k == 0) {
2332 	 A[0] = a[0];
2333 	 return;
2334       }
2335       if (k == 1) {
2336          mint_t two_inv = info.TwoInvTable[1];
2337          mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2338          mint_t A0 = AddMod(a[0], a[1], q);
2339          mint_t A1 = SubMod(a[0], a[1], q);
2340          A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2341          A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2342 	 return;
2343       }
2344    }
2345 
2346    // assume k > 1
2347    const mint_t *root = info.RootTable[1].elts();
2348    mulmod_t qinv = info.qinv;
2349 
2350    NTL_TLS_GLOBAL_ACCESS(mul_vec);
2351    ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2352 
2353    long n = 1L << k;
2354 
2355    const mint_t *wtab[NTL_FFTMaxRoot+1];
2356    for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2357 
2358    const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2359    for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2360 
2361    new_mod_t mod;
2362    mod.q = q;
2363    mod.wtab = &wtab[0];
2364    mod.wqinvtab = &wqinvtab[0];
2365 
2366    mint_t w = info.RootTable[1][k];
2367    mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2368 
2369    mint_t two_inv = info.TwoInvTable[k];
2370    mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2371 
2372 #ifdef NTL_FFT_USEBUF
2373    NTL_TLS_GLOBAL_ACCESS(AA_store);
2374    AA_store.SetLength(1L << k);
2375    umint_t *AA = AA_store.elts();
2376 
2377    for (long i = 0; i < n; i++) AA[i] = a[i];
2378 
2379    new_fft_short_notab(AA, n, n, k, mod, w, wqinv);
2380 
2381    for (long i = 0; i < n; i++) {
2382       umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2383       A[i] = LazyReduce1(tmp, q);
2384    }
2385 #else
2386    umint_t *AA = (umint_t *) A;
2387    if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2388 
2389    new_fft_short_notab(AA, n, n, k, mod, w, wqinv);
2390 
2391    for (long i = 0; i < n; i++) {
2392       umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2393       AA[i] = LazyReduce1(tmp, q);
2394    }
2395 
2396 #endif
2397 }
2398 
2399 
2400 //=========== Inverse FFT without tables  ===========
2401 
2402 void new_ifft_notab(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2403               long yn)
2404 
2405 // Performs a high-level IFFT.  Inputs and outputs are in the range [0,q).
2406 // yn==xn are as described above in the truncated FFT interface.
2407 // Both A and a should point to arrays of size 2^k,
2408 // and should either be the same or not overlap at all.
2409 // Multiplies by 2^{-k}.
2410 // This version does not use precomputed tables.
2411 
2412 {
2413    mint_t q = info.q;
2414 
2415    if (k <= 1) {
2416       if (k == 0) {
2417 	 A[0] = a[0];
2418 	 return;
2419       }
2420       if (k == 1) {
2421          mint_t two_inv = info.TwoInvTable[1];
2422          mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2423          mint_t A0 = AddMod(a[0], a[1], q);
2424          mint_t A1 = SubMod(a[0], a[1], q);
2425          A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2426          A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2427 	 return;
2428       }
2429    }
2430 
2431    // assume k > 1
2432    const mint_t *root = info.RootTable[0].elts();
2433    mulmod_t qinv = info.qinv;
2434 
2435    NTL_TLS_GLOBAL_ACCESS(mul_vec);
2436    ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2437 
2438    long n = 1L << k;
2439 
2440    const mint_t *wtab[NTL_FFTMaxRoot+1];
2441    for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2442 
2443    const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2444    for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2445 
2446    new_mod_t mod;
2447    mod.q = q;
2448    mod.wtab = &wtab[0];
2449    mod.wqinvtab = &wqinvtab[0];
2450 
2451 
2452    mint_t w = info.RootTable[0][k];
2453    mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2454 
2455    mint_t iw = info.RootTable[1][k];
2456    mulmod_precon_t iwqinv = LazyPrepMulModPrecon(iw, q, info.qinv);
2457 
2458    mint_t two_inv = info.TwoInvTable[k];
2459    mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2460 
2461 #ifdef NTL_FFT_USEBUF
2462    NTL_TLS_GLOBAL_ACCESS(AA_store);
2463    AA_store.SetLength(1L << k);
2464    umint_t *AA = AA_store.elts();
2465 
2466    for (long i = 0; i < yn; i++) AA[i] = a[i];
2467 
2468    new_ifft_short1_notab(AA, yn, k, mod, w, wqinv, iw, iwqinv);
2469 
2470    for (long i = 0; i < yn; i++) {
2471       umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2472       A[i] = LazyReduce1(tmp, q);
2473    }
2474 #else
2475    umint_t *AA = (umint_t *) A;
2476    if (a != A) for (long i = 0; i < yn; i++) AA[i] = a[i];
2477 
2478    new_ifft_short1_notab(AA, yn, k, mod, w, wqinv, iw, iwqinv);
2479 
2480    for (long i = 0; i < yn; i++) {
2481       umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2482       AA[i] = LazyReduce1(tmp, q);
2483    }
2484 
2485 #endif
2486 }
2487 
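// Editorial usage sketch (not part of the original source): as the interface
// comments above describe, for yn == xn == 2^k the inverse transform undoes
// the forward transform, with the 2^{-k} factor applied inside new_ifft_notab.
// A hypothetical round trip, assuming `info` was set up via InitFFTPrimeInfo
// (defined later in this file):
//
//    long n = 1L << k;
//    Vec<mint_t> a, A;  a.SetLength(n);  A.SetLength(n);
//    // ... fill a with values in [0, info.q) ...
//    new_fft_notab(A.elts(), a.elts(), k, info, n, n);  // forward transform
//    new_ifft_notab(A.elts(), A.elts(), k, info, n);    // inverse, scales by 2^{-k}
//    // now A[i] == a[i] for all i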
2488 
2489 void new_ifft_flipped_notab(mint_t* A, const mint_t* a, long k,
2490               const FFTPrimeInfo& info)
2491 
2492 // Performs a high-level IFFT.  Inputs and outputs are in the range [0,q).
2493 // Flipped means inverse roots are used, no truncation is performed, and
2494 // there is no multiplication by 2^{-k}.
2495 // Both A and a should point to arrays of size 2^k,
2496 // and should either be the same or not overlap at all.
2497 // This version does not use precomputed tables.
2498 
2499 {
2500    mint_t q = info.q;
2501 
2502    if (k <= 1) {
2503       if (k == 0) {
2504 	 A[0] = a[0];
2505 	 return;
2506       }
2507       if (k == 1) {
2508          mint_t A0 = AddMod(a[0], a[1], q);
2509          mint_t A1 = SubMod(a[0], a[1], q);
2510          A[0] = A0;
2511          A[1] = A1;
2512 	 return;
2513       }
2514    }
2515 
2516    // assume k > 1
2517    const mint_t *root = info.RootTable[1].elts();
2518    mulmod_t qinv = info.qinv;
2519 
2520    NTL_TLS_GLOBAL_ACCESS(mul_vec);
2521    ComputeMultipliers(mul_vec, k-1, q, qinv, root);
2522 
2523    long n = 1L << k;
2524 
2525    const mint_t *wtab[NTL_FFTMaxRoot+1];
2526    for (long s = 1; s <= k-1; s++) wtab[s] = mul_vec[s].wtab_precomp.elts();
2527 
2528    const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2529    for (long s = 1; s <= k-1; s++) wqinvtab[s] = mul_vec[s].wqinvtab_precomp.elts();
2530 
2531    new_mod_t mod;
2532    mod.q = q;
2533    mod.wtab = &wtab[0];
2534    mod.wqinvtab = &wqinvtab[0];
2535 
2536    mint_t w = info.RootTable[1][k];
2537    mulmod_precon_t wqinv = LazyPrepMulModPrecon(w, q, info.qinv);
2538 
2539    mint_t iw = info.RootTable[0][k];
2540    mulmod_precon_t iwqinv = LazyPrepMulModPrecon(iw, q, info.qinv);
2541 
2542 #ifdef NTL_FFT_USEBUF
2543    NTL_TLS_GLOBAL_ACCESS(AA_store);
2544    AA_store.SetLength(1L << k);
2545    umint_t *AA = AA_store.elts();
2546 
2547    for (long i = 0; i < n; i++) AA[i] = a[i];
2548 
2549 
2550    new_ifft_short1_notab(AA, n, k, mod, w, wqinv, iw, iwqinv);
2551 
2552    for (long i = 0; i < n; i++) {
2553       umint_t tmp = LazyReduce2(AA[i], q);
2554       A[i] = LazyReduce1(tmp, q);
2555    }
2556 #else
2557    umint_t *AA = (umint_t *) A;
2558    if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2559 
2560    new_ifft_short1_notab(AA, n, k, mod, w, wqinv, iw, iwqinv);
2561 
2562    for (long i = 0; i < n; i++) {
2563       umint_t tmp = LazyReduce2(AA[i], q);
2564       AA[i] = LazyReduce1(tmp, q);
2565    }
2566 #endif
2567 }
2568 
2569 
2570 #ifndef NTL_ENABLE_AVX_FFT
2571 
2572 //================ FFT with tables ==============
2573 
2574 
2575 void new_fft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2576              long yn, long xn)
2577 
2578 // Performs a high-level FFT.  Inputs and outputs are in the range [0,q).
2579 // xn and yn are as described above in the truncated FFT interface.
2580 // Both A and a should point to arrays of size 2^k,
2581 // and should either be the same or not overlap at all.
2582 
2583 {
2584    if (!info.bigtab || k > info.bigtab->bound) {
2585       new_fft_notab(A, a, k, info, yn, xn);
2586       return;
2587    }
2588 
2589    mint_t q = info.q;
2590 
2591    if (k <= 1) {
2592       if (k == 0) {
2593 	 A[0] = a[0];
2594 	 return;
2595       }
2596       if (k == 1) {
2597          mint_t A0 = AddMod(a[0], a[1], q);
2598          mint_t A1 = SubMod(a[0], a[1], q);
2599          A[0] = A0;
2600          A[1] = A1;
2601 	 return;
2602       }
2603    }
2604 
2605    // assume k > 1
2606    const mint_t *root = info.RootTable[0].elts();
2607    mulmod_t qinv = info.qinv;
2608    const FFTMultipliers& tab = info.bigtab->MulTab;
2609 
2610    if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2611 
2612 
2613    long n = 1L << k;
2614 
2615 
2616    const mint_t *wtab[NTL_FFTMaxRoot+1];
2617    for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2618 
2619    const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2620    for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2621 
2622    new_mod_t mod;
2623    mod.q = q;
2624    mod.wtab = &wtab[0];
2625    mod.wqinvtab = &wqinvtab[0];
2626 
2627 
2628 
2629 #ifdef NTL_FFT_USEBUF
2630    NTL_TLS_GLOBAL_ACCESS(AA_store);
2631    AA_store.SetLength(1L << k);
2632    umint_t *AA = AA_store.elts();
2633 
2634    for (long i = 0; i < xn; i++) AA[i] = a[i];
2635 
2636    new_fft_short(AA, yn, xn, k, mod);
2637 
2638    for (long i = 0; i < yn; i++) {
2639       A[i] = LazyReduce1(AA[i], q);
2640    }
2641 #else
2642    umint_t *AA = (umint_t *) A;
2643    if (a != A) for (long i = 0; i < xn; i++) AA[i] = a[i];
2644 
2645    new_fft_short(AA, yn, xn, k, mod);
2646 
2647    for (long i = 0; i < yn; i++) {
2648       AA[i] = LazyReduce1(AA[i], q);
2649    }
2650 #endif
2651 
2652 }
2653 
2654 void new_fft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
2655 
2656 // Performs a high-level FFT.  Inputs and outputs are in the range [0,q).
2657 // Both A and a should point to arrays of size 2^k,
2658 // and should either be the same or not overlap at all.
2659 // This version is "flipped" -- it uses inverted roots,
2660 // multiplies by 2^{-k}, and performs no truncations.
2661 
2662 {
2663    if (!info.bigtab || k > info.bigtab->bound) {
2664       new_fft_flipped_notab(A, a, k, info);
2665       return;
2666    }
2667 
2668    mint_t q = info.q;
2669 
2670    if (k <= 1) {
2671       if (k == 0) {
2672 	 A[0] = a[0];
2673 	 return;
2674       }
2675       if (k == 1) {
2676          mint_t two_inv = info.TwoInvTable[1];
2677          mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2678          mint_t A0 = AddMod(a[0], a[1], q);
2679          mint_t A1 = SubMod(a[0], a[1], q);
2680          A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2681          A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2682 	 return;
2683       }
2684    }
2685 
2686    // assume k > 1
2687    const mint_t *root = info.RootTable[0].elts();
2688    mulmod_t qinv = info.qinv;
2689    const FFTMultipliers& tab = info.bigtab->MulTab;
2690 
2691    if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2692 
2693 
2694    long n = 1L << k;
2695 
2696 
2697    const mint_t *wtab[NTL_FFTMaxRoot+1];
2698    for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2699 
2700    const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2701    for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2702 
2703    new_mod_t mod;
2704    mod.q = q;
2705    mod.wtab = &wtab[0];
2706    mod.wqinvtab = &wqinvtab[0];
2707 
2708    mint_t two_inv = info.TwoInvTable[k];
2709    mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2710 
2711 
2712 #ifdef NTL_FFT_USEBUF
2713    NTL_TLS_GLOBAL_ACCESS(AA_store);
2714    AA_store.SetLength(1L << k);
2715    umint_t *AA = AA_store.elts();
2716 
2717    for (long i = 0; i < n; i++) AA[i] = a[i];
2718 
2719    new_fft_short_flipped(AA, k, mod);
2720 
2721    for (long i = 0; i < n; i++) {
2722       umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2723       A[i] = LazyReduce1(tmp, q);
2724    }
2725 #else
2726    umint_t *AA = (umint_t *) A;
2727    if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2728 
2729    new_fft_short_flipped(AA, k, mod);
2730 
2731    for (long i = 0; i < n; i++) {
2732       umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2733       AA[i] = LazyReduce1(tmp, q);
2734    }
2735 #endif
2736 }
2737 
2738 //=======  Inverse FFT with tables ==============
2739 
2740 
2741 void new_ifft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
2742               long yn)
2743 
2744 // Performs a high-level IFFT.  Inputs and outputs are in the range [0,q).
2745 // yn==xn are as described above in the truncated FFT interface.
2746 // Both A and a should point to arrays of size 2^k,
2747 // and should either be the same or not overlap at all.
2748 // Multiplies by 2^{-k}.
2749 
2750 {
2751    if (!info.bigtab || k > info.bigtab->bound) {
2752       new_ifft_notab(A, a, k, info, yn);
2753       return;
2754    }
2755 
2756    mint_t q = info.q;
2757 
2758    if (k <= 1) {
2759       if (k == 0) {
2760 	 A[0] = a[0];
2761 	 return;
2762       }
2763       if (k == 1) {
2764          mint_t two_inv = info.TwoInvTable[1];
2765          mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[1];
2766          mint_t A0 = AddMod(a[0], a[1], q);
2767          mint_t A1 = SubMod(a[0], a[1], q);
2768          A[0] = LazyReduce1(LazyMulModPrecon(A0, two_inv, q, two_inv_aux), q);
2769          A[1] = LazyReduce1(LazyMulModPrecon(A1, two_inv, q, two_inv_aux), q);
2770 	 return;
2771       }
2772    }
2773 
2774    // assume k > 1
2775    const mint_t *root = info.RootTable[0].elts();
2776    mulmod_t qinv = info.qinv;
2777    const FFTMultipliers& tab = info.bigtab->MulTab;
2778 
2779    if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2780 
2781 
2782    long n = 1L << k;
2783 
2784 
2785    const mint_t *wtab[NTL_FFTMaxRoot+1];
2786    for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2787 
2788    const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2789    for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2790 
2791    new_mod_t mod;
2792    mod.q = q;
2793    mod.wtab = &wtab[0];
2794    mod.wqinvtab = &wqinvtab[0];
2795 
2796    mint_t two_inv = info.TwoInvTable[k];
2797    mulmod_precon_t two_inv_aux = info.TwoInvPreconTable[k];
2798 
2799 #ifdef NTL_FFT_USEBUF
2800    NTL_TLS_GLOBAL_ACCESS(AA_store);
2801    AA_store.SetLength(1L << k);
2802    umint_t *AA = AA_store.elts();
2803 
2804    for (long i = 0; i < yn; i++) AA[i] = a[i];
2805 
2806    new_ifft_short1(AA, yn, k, mod);
2807 
2808    for (long i = 0; i < yn; i++) {
2809       umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2810       A[i] = LazyReduce1(tmp, q);
2811    }
2812 #else
2813    umint_t *AA = (umint_t *) A;
2814    if (a != A) for (long i = 0; i < yn; i++) AA[i] = a[i];
2815 
2816    new_ifft_short1(AA, yn, k, mod);
2817 
2818    for (long i = 0; i < yn; i++) {
2819       umint_t tmp = LazyMulModPrecon(AA[i], two_inv, q, two_inv_aux);
2820       AA[i] = LazyReduce1(tmp, q);
2821    }
2822 #endif
2823 }
2824 
2825 
2826 void new_ifft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
2827 
2828 
2829 // Performs a high-level IFFT.  Inputs and outputs are in the range [0,q).
2830 // Flipped means inverse roots are used, no truncation is performed, and
2831 // there is no multiplication by 2^{-k}.
2832 // Both A and a should point to arrays of size 2^k,
2833 // and should either be the same or not overlap at all.
2834 
2835 
2836 {
2837    if (!info.bigtab || k > info.bigtab->bound) {
2838       new_ifft_flipped_notab(A, a, k, info);
2839       return;
2840    }
2841 
2842    mint_t q = info.q;
2843 
2844    if (k <= 1) {
2845       if (k == 0) {
2846 	 A[0] = a[0];
2847 	 return;
2848       }
2849       if (k == 1) {
2850          mint_t A0 = AddMod(a[0], a[1], q);
2851          mint_t A1 = SubMod(a[0], a[1], q);
2852          A[0] = A0;
2853          A[1] = A1;
2854 	 return;
2855       }
2856    }
2857 
2858    // assume k > 1
2859    const mint_t *root = info.RootTable[0].elts();
2860    mulmod_t qinv = info.qinv;
2861    const FFTMultipliers& tab = info.bigtab->MulTab;
2862 
2863    if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
2864 
2865 
2866    long n = 1L << k;
2867 
2868 
2869    const mint_t *wtab[NTL_FFTMaxRoot+1];
2870    for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
2871 
2872    const mulmod_precon_t *wqinvtab[NTL_FFTMaxRoot+1];
2873    for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
2874 
2875    new_mod_t mod;
2876    mod.q = q;
2877    mod.wtab = &wtab[0];
2878    mod.wqinvtab = &wqinvtab[0];
2879 
2880 
2881 #ifdef NTL_FFT_USEBUF
2882    NTL_TLS_GLOBAL_ACCESS(AA_store);
2883    AA_store.SetLength(1L << k);
2884    umint_t *AA = AA_store.elts();
2885 
2886    for (long i = 0; i < n; i++) AA[i] = a[i];
2887 
2888    new_ifft_short1_flipped(AA, k, mod);
2889 
2890    for (long i = 0; i < n; i++) {
2891       umint_t tmp = LazyReduce2(AA[i], q);
2892       A[i] = LazyReduce1(tmp, q);
2893    }
2894 #else
2895    umint_t *AA = (umint_t *) A;
2896    if (a != A) for (long i = 0; i < n; i++) AA[i] = a[i];
2897 
2898    new_ifft_short1_flipped(AA, k, mod);
2899 
2900    for (long i = 0; i < n; i++) {
2901       umint_t tmp = LazyReduce2(AA[i], q);
2902       AA[i] = LazyReduce1(tmp, q);
2903    }
2904 #endif
2905 }
2906 
2907 #endif
2908 
2909 //===============================================
2910 
2911 void InitFFTPrimeInfo(FFTPrimeInfo& info, long q, long w, long bigtab_index)
2912 {
2913    mulmod_t qinv = PrepMulMod(q);
2914 
2915    long mr = CalcMaxRoot(q);
2916 
2917    info.q = q;
2918    info.qinv = qinv;
2919    info.qrecip = 1/double(q);
2920    info.zz_p_context = 0;
2921 
2922 
2923    info.RootTable[0].SetLength(mr+1);
2924    info.RootTable[1].SetLength(mr+1);
2925    info.TwoInvTable.SetLength(mr+1);
2926    info.TwoInvPreconTable.SetLength(mr+1);
2927 
2928    long *rt = &info.RootTable[0][0];
2929    long *rit = &info.RootTable[1][0];
2930    long *tit = &info.TwoInvTable[0];
2931    mulmod_precon_t *tipt = &info.TwoInvPreconTable[0];
2932 
2933    long j;
2934    long t;
2935 
2936    rt[mr] = w;
2937    for (j = mr-1; j >= 0; j--)
2938       rt[j] = MulMod(rt[j+1], rt[j+1], q);
2939 
2940    rit[mr] = InvMod(w, q);
2941    for (j = mr-1; j >= 0; j--)
2942       rit[j] = MulMod(rit[j+1], rit[j+1], q);
2943 
2944    t = InvMod(2, q);
2945    tit[0] = 1;
2946    for (j = 1; j <= mr; j++)
2947       tit[j] = MulMod(tit[j-1], t, q);
2948 
2949    for (j = 0; j <= mr; j++)
2950       tipt[j] = LazyPrepMulModPrecon(tit[j], q, qinv);
2951 
2952 #ifndef NTL_ENABLE_AVX_FFT
2953    if (bigtab_index != -1) {
2954       long bound = NTL_FFT_BIGTAB_MAXROOT-bigtab_index/NTL_FFT_BIGTAB_LIMIT;
2955       if (bound > NTL_FFT_BIGTAB_MINROOT) {
2956          info.bigtab.make();
2957          info.bigtab->bound = bound;
2958       }
2959    }
2960 #else
2961    // with the AVX implementation, we unconditionally use tables
2962    info.bigtab.make();
2963 #endif
2964 }
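// Invariants established above (editorial note, assuming the caller passes a
// primitive 2^mr-th root of unity w, where mr = CalcMaxRoot(q)):
// RootTable[0][j] is a primitive 2^j-th root of unity mod q (entry mr is w,
// and each lower entry is the square of the one above it); RootTable[1][j] is
// its inverse; TwoInvTable[j] = 2^{-j} mod q; and TwoInvPreconTable holds the
// corresponding precomputed quotients for LazyMulModPrecon.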
2965 
2966 
2967 //===================================================================
2968 
2969 #ifdef NTL_ENABLE_AVX_FFT
2970 
2971 static void
2972 pd_LazyPrepMulModPrecon(double *bninv, const double *b, double n, long len)
2973 {
2974    CSRPush push;
2975    pd_LazyPrepMulModPrecon_impl(bninv, b, n, len);
2976 }
2977 
2978 static
2979 void LazyPrecompFFTMultipliers(long k, mint_t q, mulmod_t qinv, const mint_t *root, const pd_FFTMultipliers& tab)
2980 {
2981    if (k < 1) LogicError("LazyPrecompFFTMultipliers: bad input");
2982 
2983    do { // NOTE: thread safe lazy init
2984       pd_FFTMultipliers::Builder bld(tab, k+1);
2985       long amt = bld.amt();
2986       if (!amt) break;
2987 
2988       long first = k+1-amt;
2989       // initialize entries first..k
2990 
2991 
2992       for (long s = first; s <= k; s++) {
2993          UniquePtr<pd_FFTVectorPair> item;
2994 
2995          if (s == 0) {
2996             bld.move(item); // position 0 not used
2997             continue;
2998          }
2999 
3000          long m = 1L << s;
3001          long m_half = 1L << (s-1);
3002 
3003          item.make();
3004          item->wtab_precomp.SetLength(m_half);
3005          item->wqinvtab_precomp.SetLength(m_half);
3006 
3007          double *wtab = item->wtab_precomp.elts();
3008          double *wqinvtab = item->wqinvtab_precomp.elts();
3009 
3010          mint_t w = root[s];
3011          mulmod_precon_t wqinv = PrepMulModPrecon(w, q, qinv);
3012 
3013          mint_t wi = 1;
3014          wtab[0] = wi;
3015          for (long i = 1; i < m_half; i++) {
3016             wi = MulModPrecon(wi, w, q, wqinv);
3017             wtab[i] = wi;
3018          }
3019          pd_LazyPrepMulModPrecon(wqinvtab, wtab, q, m_half);
3020 
3021          bld.move(item);
3022       }
3023    } while (0);
3024 }
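// Table layout produced above (editorial note): for each level s in 1..k,
// tab[s]->wtab_precomp holds w^0, w^1, ..., w^{2^{s-1}-1} as doubles, where
// w = root[s] is a primitive 2^s-th root of unity, and wqinvtab_precomp holds
// the matching precomputed data for the packed-double (pd_) lazy
// multiplication kernels.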
3025 
3026 NTL_TLS_GLOBAL_DECL(AlignedArray<double>, pd_AA_store)
3027 static NTL_CHEAP_THREAD_LOCAL long pd_AA_store_len = 0;
3028 
3029 
3030 #define PD_MIN_K (NTL_LG2_PDSZ+3)
3031 // k must be at least PD_MIN_K
3032 
3033 void new_fft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
3034             long yn, long xn)
3035 {
3036    if (k < PD_MIN_K) {
3037       new_fft_notab(A, a, k, info, yn, xn);
3038       return;
3039    }
3040 
3041    long dir = 0;
3042 
3043    mint_t q = info.q;
3044    const mint_t *root = info.RootTable[dir].elts();
3045    mulmod_t qinv = info.qinv;
3046    const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[dir];
3047 
3048    if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3049 
3050    const double *wtab[NTL_FFTMaxRoot+1];
3051    for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3052 
3053    const double *wqinvtab[NTL_FFTMaxRoot+1];
3054    for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3055 
3056    pd_mod_t mod;
3057    mod.q = q;
3058    mod.wtab = &wtab[0];
3059    mod.wqinvtab = &wqinvtab[0];
3060 
3061    long n = 1L << k;
3062 
3063    NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3064    if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3065    double *AA = pd_AA_store.elts();
3066 
3067    CSRPush push;
3068    pd_fft_trunc_impl(A, a, AA, k, mod, yn, xn);
3069 }
3070 
3071 
3072 
3073 void new_fft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
3074 {
3075    if (k < PD_MIN_K) {
3076       new_fft_flipped_notab(A, a, k, info);
3077       return;
3078    }
3079 
3080    long dir = 1;
3081 
3082    mint_t q = info.q;
3083    const mint_t *root = info.RootTable[dir].elts();
3084    mulmod_t qinv = info.qinv;
3085    const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[dir];
3086 
3087    if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3088 
3089    const double *wtab[NTL_FFTMaxRoot+1];
3090    for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3091 
3092    const double *wqinvtab[NTL_FFTMaxRoot+1];
3093    for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3094 
3095    pd_mod_t mod;
3096    mod.q = q;
3097    mod.wtab = &wtab[0];
3098    mod.wqinvtab = &wqinvtab[0];
3099 
3100    long n = 1L << k;
3101 
3102    NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3103    if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3104    double *AA = pd_AA_store.elts();
3105 
3106    CSRPush push;
3107    pd_fft_trunc_impl(A, a, AA, k, mod, n, n, info.TwoInvTable[k]);
3108 }
3109 
3110 
3111 void new_ifft(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info,
3112             long yn)
3113 {
3114    if (k < PD_MIN_K) {
3115       new_ifft_notab(A, a, k, info, yn);
3116       return;
3117    }
3118 
3119    long dir = 0;
3120 
3121    mint_t q = info.q;
3122    const mint_t *root = info.RootTable[1-dir].elts();
3123    const mint_t *root1 = info.RootTable[dir].elts();
3124    mulmod_t qinv = info.qinv;
3125    const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[1-dir];
3126    const pd_FFTMultipliers& tab1 = info.bigtab->pd_MulTab[dir];
3127 
3128    if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3129    if (k >= tab1.length()) LazyPrecompFFTMultipliers(k, q, qinv, root1, tab1);
3130 
3131    const double *wtab[NTL_FFTMaxRoot+1];
3132    for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3133 
3134    const double *wqinvtab[NTL_FFTMaxRoot+1];
3135    for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3136 
3137    const double *wtab1[NTL_FFTMaxRoot+1];
3138    for (long s = 1; s <= k; s++) wtab1[s] = tab1[s]->wtab_precomp.elts();
3139 
3140    const double *wqinvtab1[NTL_FFTMaxRoot+1];
3141    for (long s = 1; s <= k; s++) wqinvtab1[s] = tab1[s]->wqinvtab_precomp.elts();
3142 
3143    pd_mod_t mod;
3144    mod.q = q;
3145    mod.wtab = &wtab[0];
3146    mod.wqinvtab = &wqinvtab[0];
3147    mod.wtab1 = &wtab1[0];
3148    mod.wqinvtab1 = &wqinvtab1[0];
3149 
3150    long n = 1L << k;
3151 
3152    NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3153    if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3154    double *AA = pd_AA_store.elts();
3155 
3156    CSRPush push;
3157    pd_ifft_trunc_impl(A, a, AA, k, mod, yn, info.TwoInvTable[k]);
3158 }
3159 
3160 
3161 void new_ifft_flipped(mint_t* A, const mint_t* a, long k, const FFTPrimeInfo& info)
3162 {
3163    if (k < PD_MIN_K) {
3164       new_ifft_flipped_notab(A, a, k, info);
3165       return;
3166    }
3167 
3168    long dir = 1;
3169 
3170    mint_t q = info.q;
3171    const mint_t *root = info.RootTable[1-dir].elts();
3172    const mint_t *root1 = info.RootTable[dir].elts();
3173    mulmod_t qinv = info.qinv;
3174    const pd_FFTMultipliers& tab = info.bigtab->pd_MulTab[1-dir];
3175    const pd_FFTMultipliers& tab1 = info.bigtab->pd_MulTab[dir];
3176 
3177    if (k >= tab.length()) LazyPrecompFFTMultipliers(k, q, qinv, root, tab);
3178    if (k >= tab1.length()) LazyPrecompFFTMultipliers(k, q, qinv, root1, tab1);
3179 
3180    const double *wtab[NTL_FFTMaxRoot+1];
3181    for (long s = 1; s <= k; s++) wtab[s] = tab[s]->wtab_precomp.elts();
3182 
3183    const double *wqinvtab[NTL_FFTMaxRoot+1];
3184    for (long s = 1; s <= k; s++) wqinvtab[s] = tab[s]->wqinvtab_precomp.elts();
3185 
3186    const double *wtab1[NTL_FFTMaxRoot+1];
3187    for (long s = 1; s <= k; s++) wtab1[s] = tab1[s]->wtab_precomp.elts();
3188 
3189    const double *wqinvtab1[NTL_FFTMaxRoot+1];
3190    for (long s = 1; s <= k; s++) wqinvtab1[s] = tab1[s]->wqinvtab_precomp.elts();
3191 
3192    pd_mod_t mod;
3193    mod.q = q;
3194    mod.wtab = &wtab[0];
3195    mod.wqinvtab = &wqinvtab[0];
3196    mod.wtab1 = &wtab1[0];
3197    mod.wqinvtab1 = &wqinvtab1[0];
3198 
3199    long n = 1L << k;
3200 
3201    NTL_TLS_GLOBAL_ACCESS(pd_AA_store);
3202    if (pd_AA_store_len < n) pd_AA_store.SetLength(n);
3203    double *AA = pd_AA_store.elts();
3204 
3205    CSRPush push;
3206    pd_ifft_trunc_impl(A, a, AA, k, mod, n);
3207 }
3208 
3209 #endif
3210 
3211 
3212 NTL_END_IMPL
3213