/*
   zn_poly_internal.h:  main header file #included internally by zn_poly
                        modules

   Copyright (C) 2007, 2008, David Harvey

   This file is part of the zn_poly library (version 0.9).

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 2 of the License, or
   (at your option) version 3 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

*/


/*

   IMPORTANT NOTE!!!!!!

   Everything in this file is internal, and may change incompatibly in
   future versions of zn_poly. You have been warned!

*/


#ifndef ZN_POLY_INTERNAL_H
#define ZN_POLY_INTERNAL_H

#include <gmp.h>
#include <stddef.h>
#include <stdio.h>
#include "zn_poly.h"


#ifdef __cplusplus
extern "C" {
#endif



#ifdef ZNP_USE_FLINT

// use FLINT integer multiplication
#include <FLINT/mpn_extras.h>
#define ZNP_mpn_mul F_mpn_mul

#else

// use GMP integer multiplication
#define ZNP_mpn_mul mpn_mul

#endif



/*
   Executes "stuff", and checks that it returns 0. Usage is e.g.:

      ZNP_ASSERT_NOCARRY (mpn_add_n (z, x, y, 42));

   This is used where we want the test suite to check that the return value
   is zero, but the production version should skip the check, and we don't
   want to stuff around with temporaries everywhere.
*/
#define ZNP_ASSERT_NOCARRY(stuff)   \
    do {                            \
       mp_limb_t __xyz_cy;          \
       __xyz_cy = (stuff);          \
       ZNP_ASSERT (__xyz_cy == 0);  \
    } while (0)


/*
   For integers a >= 1 and b >= 1, returns ceil(a / b).
*/
#define CEIL_DIV(a, b) ((((a) - 1) / (b)) + 1)


/*
   For integers a >= 1 and 0 <= r < ULONG_BITS, returns ceil(a / 2^r).
*/
#define CEIL_DIV_2EXP(a, r) ((((a) - 1) >> (r)) + 1)


#define ZNP_MIN(aaa, bbb) (((aaa) < (bbb)) ? (aaa) : (bbb))
#define ZNP_MAX(aaa, bbb) (((aaa) > (bbb)) ? (aaa) : (bbb))


/*
   Estimate of the L1 cache size, in bytes.
   If this is a bit on the small side, it's probably not a big deal.
   If it's on the big side, that might start to seriously degrade performance.
*/
#define ZNP_CACHE_SIZE 32768


/*
   Returns ceil(log2(x)).
   x must be >= 1.
*/
#define ceil_lg \
    ZNP_ceil_lg
int ceil_lg (ulong x);


/*
   Returns floor(log2(x)).
   Returns -1 for x == 0.
*/
#define floor_lg \
    ZNP_floor_lg
int
floor_lg (ulong x);


/*
   res := abs(op1 - op2).

   Returns 1 if op1 - op2 is negative, else zero.
*/
#define signed_mpn_sub_n \
    ZNP_signed_mpn_sub_n
ZNP_INLINE int
signed_mpn_sub_n (mp_limb_t* res, const mp_limb_t* op1, const mp_limb_t* op2,
                  size_t n)
{
   if (mpn_cmp (op1, op2, n) >= 0)
   {
      mpn_sub_n (res, op1, op2, n);
      return 0;
   }
   else
   {
      mpn_sub_n (res, op2, op1, n);
      return 1;
   }
}


/*
   The ZNP_FASTALLOC and ZNP_FASTFREE macros are used for allocating memory
   which is taken off the stack if the request is small enough, or off the
   heap if not.

   Example usage:

   ZNP_FASTALLOC (stuff, int, 100, n);

   This does two things. It allocates an array of 100 ints on the stack.
   It also declares a pointer "int* stuff", which points to a block of ints
   of length n. If n <= 100, the block will be the one just allocated on the
   stack. If n > 100, the block will be found using malloc.

   Then afterwards, you need to do:

   ZNP_FASTFREE (stuff);

   This will call free() if the block was originally taken off the heap.
*/

#define ZNP_FASTALLOC(ptr, type, reserve, request)        \
   size_t __FASTALLOC_request_##ptr = (request);          \
   type* ptr;                                             \
   type __FASTALLOC_##ptr [reserve];                      \
   if (__FASTALLOC_request_##ptr <= (reserve))            \
      ptr = __FASTALLOC_##ptr;                            \
   else                                                   \
      ptr = (type*) malloc (sizeof (type) * __FASTALLOC_request_##ptr);


#define ZNP_FASTFREE(ptr)                                 \
   if (ptr != __FASTALLOC_##ptr)                          \
      free (ptr);
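
/*
   A minimal usage sketch for the two macros above (hypothetical fragment;
   the reserve of 64 is arbitrary):

      void example_routine (size_t n)
      {
         ZNP_FASTALLOC (tmp, ulong, 64, n);   // stack if n <= 64, else heap
         size_t i;
         for (i = 0; i < n; i++)
            tmp[i] = i;
         // ... work with tmp[0, n) ...
         ZNP_FASTFREE (tmp);    // calls free() only if malloc was used
      }
*/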



extern size_t
ZNP_mpn_smp_kara_thresh;

extern size_t
ZNP_mpn_mulmid_fallback_thresh;


/*
   Stores tuning data for moduli of a specific bitsize.
*/
#define tuning_info_t \
    ZNP_tuning_info_t
typedef struct
{
   // thresholds for array multiplication
   size_t mul_KS2_thresh;    // KS1 -> KS2 threshold
   size_t mul_KS4_thresh;    // KS2 -> KS4 threshold
   size_t mul_fft_thresh;    // KS4 -> fft threshold

   // as above, but for squaring
   size_t sqr_KS2_thresh;
   size_t sqr_KS4_thresh;
   size_t sqr_fft_thresh;

   // as above, but for middle products
   size_t mulmid_KS2_thresh;
   size_t mulmid_KS4_thresh;
   size_t mulmid_fft_thresh;

   // for negacyclic multiplications, switch from KS to Nussbaumer FFT
   // when length reaches 2^nuss_mul_thresh
   unsigned nuss_mul_thresh;
   // ditto for Nussbaumer squaring
   unsigned nuss_sqr_thresh;

}
tuning_info_t;


/*
   Global array of tuning_info_t's, one for each bitsize.
*/
#define tuning_info \
    ZNP_tuning_info
extern tuning_info_t tuning_info[];



/* ============================================================================

     stuff from pack.c

============================================================================ */

/*
   Computes

      res := 2^k * ( op[0] + op[s]*2^b + ... + op[(n-1)*s]*2^((n-1)*b) ).

   Assumes each op[i] satisfies 0 <= op[i] < 2^b.

   Must have 0 < b < 3 * ULONG_BITS.

   If r == 0, then exactly
       ceil((k + n * b) / GMP_NUMB_BITS)
   limbs are written. Otherwise, the output will be zero-padded up to exactly
   r limbs, which must be at least the above number of limbs.

*/
#define zn_array_pack \
    ZNP_zn_array_pack
void
zn_array_pack (mp_limb_t* res, const ulong* op, size_t n, ptrdiff_t s,
               unsigned b, unsigned k, size_t r);


/*
   Let op be an integer of the form

      2^k * (a[0] + a[1]*2^b + ... + a[n-1]*2^((n-1)*b)) + junk,

   where 0 <= a[i] < 2^b for each i, and where 0 <= junk < 2^k.

   This function reads off the a[i]'s and stores them at res. Each output
   coefficient occupies exactly ceil(b / ULONG_BITS) words. The input
   should be exactly ceil((k + n * b) / GMP_NUMB_BITS) limbs long.

   Must have 0 < b < 3 * ULONG_BITS.
*/
#define zn_array_unpack \
    ZNP_zn_array_unpack
void
zn_array_unpack (ulong* res, const mp_limb_t* op, size_t n, unsigned b,
                 unsigned k);


/*
   Same as zn_array_unpack, but adds an assertion to check that the unpacking
   routine will not read beyond the first r limbs of op.
*/
#define zn_array_unpack_SAFE(res, op, n, b, k, r)                     \
do                                                                    \
{                                                                     \
   ZNP_ASSERT((n) * (b) + (k) <= (r) * GMP_NUMB_BITS);                \
   zn_array_unpack(res, op, n, b, k);                                 \
} while (0)
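
/*
   Round-trip sketch for zn_array_pack/unpack (hypothetical fragment;
   here b = 10, k = 0, s = 1, and r = 0 requests minimal length):

      ulong a[4] = { 1, 2, 3, 4 };   // each entry < 2^10
      ulong c[4];
      mp_limb_t buf[CEIL_DIV (4 * 10, GMP_NUMB_BITS)];

      zn_array_pack (buf, a, 4, 1, 10, 0, 0);   // buf := a[0] + a[1]*2^10 + ...
      zn_array_unpack (c, buf, 4, 10, 0);       // now c[i] == a[i]
*/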



/* ============================================================================

     stuff from mul.c

============================================================================ */

/*
   Identical to zn_array_mul(), except for the fastred flag.

   If fastred is cleared, the output is the same as for zn_array_mul().

   If fastred is set, the routine uses the fastest modular reduction strategy
   available for the given parameters. The result will come out divided by a
   fudge factor, which can be recovered via _zn_array_mul_fudge().
*/
#define _zn_array_mul \
    ZNP__zn_array_mul
void
_zn_array_mul (ulong* res,
               const ulong* op1, size_t n1,
               const ulong* op2, size_t n2,
               int fastred, const zn_mod_t mod);

#define _zn_array_mul_fudge \
    ZNP__zn_array_mul_fudge
ulong
_zn_array_mul_fudge (size_t n1, size_t n2, int sqr, const zn_mod_t mod);
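
/*
   Sketch of undoing the fudge factor (hypothetical fragment; res must have
   room for n1 + n2 - 1 coefficients as usual):

      _zn_array_mul (res, op1, n1, op2, n2, 1, mod);
      ulong fudge = _zn_array_mul_fudge (n1, n2, 0, mod);
      // res == (op1 * op2) / fudge; scale back to get the plain product:
      zn_array_scalar_mul (res, res, n1 + n2 - 1, fudge, mod);
*/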



/* ============================================================================

     stuff from ks_support.c

============================================================================ */


/*
   Sets res[i * s] = reduction modulo mod of the i-th entry of op,
   for 0 <= i < n. Each entry of op is w ulongs.

   If the redc flag is set, the results are divided by -B mod m
   (only allowed if the modulus is odd).

   Must have 1 <= w <= 3.
*/
#define array_reduce \
    ZNP_array_reduce
void
array_reduce (ulong* res, ptrdiff_t s, const ulong* op, size_t n, unsigned w,
              int redc, const zn_mod_t mod);



/*
   This is a helper function for the variants of KS that evaluate at
   "reciprocal" evaluation points like 2^(-N); it implements essentially the
   algorithm of section 3.2 of [Har07], plus reductions mod m.

   It accepts two integers X and Y written in base M = 2^b, where
   1 <= b <= 3 * ULONG_BITS / 2. It assumes that

      X = a[0] + a[1]*M + ... + a[n-1]*M^(n-1),
      Y = a[n-1] + a[n-2]*M + ... + a[0]*M^(n-1),

   where each a[i] is two "digits" long, and where the high digit of a[i]
   is at most M-2 (i.e. may not equal M-1). It reconstructs the a[i],
   reduces them mod m, and stores the results in an array.

   The input is supplied as follows. X is in op1, Y is in op2. They are both
   arrays of values that are b bits wide, where b <= 3 * ULONG_BITS / 2.
   Each value takes up one ulong if b <= ULONG_BITS, otherwise two ulongs.
   There are n + 1 such values in each array (i.e. each array consists of
   (n + 1) * ceil(b / ULONG_BITS) ulongs).

   The output (n ulongs) is written to the array res, with consecutive
   outputs separated by s ulongs.

   mod describes the modulus m.

   If the redc flag is set, the modular reductions are performed using REDC,
   i.e. the result contains an extra factor of -1/B mod m (where
   B = 2^ULONG_BITS).
*/

#define zn_array_recover_reduce \
    ZNP_zn_array_recover_reduce
void
zn_array_recover_reduce (ulong* res, ptrdiff_t s, const ulong* op1,
                         const ulong* op2, size_t n, unsigned b, int redc,
                         const zn_mod_t mod);



/* ============================================================================

     stuff from mul_ks.c

============================================================================ */

/*
   These are the same as zn_array_mul(). They use four different types of
   Kronecker substitution. They automatically use a faster algorithm for
   squaring (if the inputs are identical buffers).

   Aliasing of all operands allowed.

   Must have n1 >= n2 >= 1.

   If the redc flag is set, the outputs will be divided by -B mod m.
   (Only allowed if the modulus is odd.)
*/
#define zn_array_mul_KS1 \
    ZNP_zn_array_mul_KS1
void
zn_array_mul_KS1 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod);

#define zn_array_mul_KS2 \
    ZNP_zn_array_mul_KS2
void
zn_array_mul_KS2 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod);

#define zn_array_mul_KS3 \
    ZNP_zn_array_mul_KS3
void
zn_array_mul_KS3 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod);

#define zn_array_mul_KS4 \
    ZNP_zn_array_mul_KS4
void
zn_array_mul_KS4 (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int redc, const zn_mod_t mod);


/* ============================================================================

     stuff from mulmid_ks.c

============================================================================ */


/*
   These are the same as zn_array_mulmid(). They use four different types of
   Kronecker substitution.

   Aliasing of all operands allowed.

   Must have n1 >= n2 >= 1.

   If the redc flag is set, the outputs will be divided by -B mod m.
   (Only allowed if the modulus is odd.)
*/
#define zn_array_mulmid_KS1 \
    ZNP_zn_array_mulmid_KS1
void
zn_array_mulmid_KS1 (ulong* res,
                     const ulong* op1, size_t n1,
                     const ulong* op2, size_t n2,
                     int redc, const zn_mod_t mod);

#define zn_array_mulmid_KS2 \
    ZNP_zn_array_mulmid_KS2
void
zn_array_mulmid_KS2 (ulong* res,
                     const ulong* op1, size_t n1,
                     const ulong* op2, size_t n2,
                     int redc, const zn_mod_t mod);

#define zn_array_mulmid_KS3 \
    ZNP_zn_array_mulmid_KS3
void
zn_array_mulmid_KS3 (ulong* res,
                     const ulong* op1, size_t n1,
                     const ulong* op2, size_t n2,
                     int redc, const zn_mod_t mod);

#define zn_array_mulmid_KS4 \
    ZNP_zn_array_mulmid_KS4
void
zn_array_mulmid_KS4 (ulong* res,
                     const ulong* op1, size_t n1,
                     const ulong* op2, size_t n2,
                     int redc, const zn_mod_t mod);



/* ============================================================================

     pmf_t stuff

============================================================================ */


/*
   Let R = Z/mZ. A pmf_t ("pmf" = "polynomial modulo fermat") represents an
   element of S = R[Y]/(Y^M + 1). This is used as the coefficient ring in
   the Schonhage/Nussbaumer FFT routines.

   It's an array of ulongs of length M + 1, where M = 2^lgM is a power of two.
   (The value M is not stored, the caller needs to keep track of it.)

   The first value in the array is an integer b called the "bias" of the
   representation.

   The remaining M values are coefficients a_0, ..., a_{M-1}.

   These together represent the polynomial

     Y^b (a_0 + a_1 Y + ... + a_{M-1} Y^{M-1}).

   Note that elements of S do not have a unique representation in this form;
   in fact they have one possible representation for each value of b in
   [0, 2M). (By allowing nonzero bias, we get more efficient in-place FFT
   butterflies.) The stored bias value need not be in [0, 2M), but it is
   interpreted mod 2M.

   Currently the values a_i are always normalised into [0, m). Later we might
   drop that restriction to obtain faster butterflies...

*/
#define pmf_t \
    ZNP_pmf_t
typedef ulong*  pmf_t;

#define pmf_const_t \
    ZNP_pmf_const_t
typedef const ulong*  pmf_const_t;
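
/*
   Worked example of the representation: take M = 4 and the array
   {1, 5, 0, 0, 2}, i.e. bias b = 1 and coefficients a_0 = 5, a_3 = 2.
   This represents Y^1 * (5 + 2 Y^3) = 5 Y + 2 Y^4 = -2 + 5 Y in S,
   since Y^M = -1. In particular, pmf_rotate() below multiplies by Y^r
   simply by adding r to the bias word, with no data movement.
*/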


/*
   op := op * x
*/
#define pmf_scalar_mul \
    ZNP_pmf_scalar_mul
ZNP_INLINE void
pmf_scalar_mul (pmf_t op, ulong M, ulong x, const zn_mod_t mod)
{
   zn_array_scalar_mul (op + 1, op + 1, M, x, mod);
}


/*
   op := 0, with bias reset to zero too
*/
#define pmf_zero \
    ZNP_pmf_zero
ZNP_INLINE void
pmf_zero (pmf_t op, ulong M)
{
   for (M++; M > 0; M--)
      *op++ = 0;
}


/*
   op := op / 2

   Modulus must be odd.
*/
#define pmf_divby2 \
    ZNP_pmf_divby2
ZNP_INLINE void
pmf_divby2 (pmf_t op, ulong M, const zn_mod_t mod)
{
   ZNP_ASSERT (mod->m & 1);

   for (op++; M > 0; M--, op++)
      *op = zn_mod_divby2 (*op, mod);
}


/*
   res := op
*/
#define pmf_set \
    ZNP_pmf_set
ZNP_INLINE void
pmf_set (pmf_t res, pmf_t op, ulong M)
{
   for (M++; M > 0; M--)
      *res++ = *op++;
}


/*
   op := Y^r * op
*/
#define pmf_rotate \
    ZNP_pmf_rotate
ZNP_INLINE void
pmf_rotate (pmf_t op, ulong r)
{
   op[0] += r;
}


/*
   op1 := op2 + op1
   op2 := op2 - op1

   Inputs must be in [0, m); outputs will be in [0, m).
*/
#define pmf_bfly \
    ZNP_pmf_bfly
void
pmf_bfly (pmf_t op1, pmf_t op2, ulong M, const zn_mod_t mod);

/*
   op1 := op1 + op2

   Inputs must be in [0, m); outputs will be in [0, m).
*/
#define pmf_add \
    ZNP_pmf_add
void
pmf_add (pmf_t op1, const pmf_t op2, ulong M, const zn_mod_t mod);

/*
   op1 := op1 - op2

   Inputs must be in [0, m); outputs will be in [0, m).
*/
#define pmf_sub \
    ZNP_pmf_sub
void
pmf_sub (pmf_t op1, const pmf_t op2, ulong M, const zn_mod_t mod);




/*
   These functions are exported just for profiling purposes:
*/

#define zn_array_bfly_inplace \
    ZNP_zn_array_bfly_inplace
void
zn_array_bfly_inplace (ulong* op1, ulong* op2, ulong n, const zn_mod_t mod);

#define zn_array_add_inplace \
    ZNP_zn_array_add_inplace
void
zn_array_add_inplace (ulong* op1, const ulong* op2, ulong n,
                      const zn_mod_t mod);

#define zn_array_sub_inplace \
    ZNP_zn_array_sub_inplace
void
zn_array_sub_inplace (ulong* op1, const ulong* op2, ulong n,
                      const zn_mod_t mod);



/* ============================================================================

     pmfvec_t stuff

============================================================================ */


/*
   A pmfvec_t stores a vector of length K = 2^lgK of elements of S.

   Used to represent an element of S[Z]/(Z^K + 1) or S[Z]/(Z^K - 1), or some
   other quotient like that.

   The functions pmfvec_init/clear should be used to allocate storage for
   this type. Also sometimes fake ones get created temporarily to point at
   sub-vectors of existing vectors.
*/
#define pmfvec_struct \
    ZNP_pmfvec_struct
typedef struct
{
   // points to the first coefficient
   pmf_t data;

   // number of coefficients
   ulong K;
   unsigned lgK;      // lg2(K)

   // length of coefficients (see definition of pmf_t)
   ulong M;
   unsigned lgM;      // lg2(M)

   // distance between adjacent coefficients, measured in ulongs
   // (this is at least M + 1, might be more)
   ptrdiff_t skip;

   // associated modulus
   const zn_mod_struct* mod;
}
pmfvec_struct;

#define pmfvec_t \
    ZNP_pmfvec_t
typedef pmfvec_struct  pmfvec_t[1];


/*
   Checks that vec1 and vec2 have compatible data,
   i.e. have the same K, M, mod.
*/
#define pmfvec_compatible \
    ZNP_pmfvec_compatible
ZNP_INLINE int
pmfvec_compatible (const pmfvec_t vec1, const pmfvec_t vec2)
{
   return (vec1->K == vec2->K) && (vec1->M == vec2->M) &&
          (vec1->mod == vec2->mod);
}


/*
   Initialises res with given parameters, allocates memory.
*/
#define pmfvec_init \
    ZNP_pmfvec_init
void
pmfvec_init (pmfvec_t res, unsigned lgK, ptrdiff_t skip, unsigned lgM,
             const zn_mod_t mod);


/*
   Initialises res in preparation for a Nussbaumer multiplication of
   length 2^lgL.
*/
#define pmfvec_init_nuss \
    ZNP_pmfvec_init_nuss
void
pmfvec_init_nuss (pmfvec_t res, unsigned lgL, const zn_mod_t mod);


/*
   Destroys op, frees all associated memory.
*/
#define pmfvec_clear \
    ZNP_pmfvec_clear
void
pmfvec_clear (pmfvec_t op);
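
/*
   Typical lifecycle sketch (hypothetical fragment; lgK = 3, lgM = 2,
   i.e. K = 8 coefficients of length M = 4, stored with the minimal
   skip of M + 1):

      zn_mod_t mod;
      zn_mod_init (mod, 7);   // modulus m = 7
      pmfvec_t vec;
      pmfvec_init (vec, 3, 4 + 1, 2, mod);
      // ... fill vec->data, transform, multiply pointwise, ...
      pmfvec_clear (vec);
      zn_mod_clear (mod);
*/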


/*
   res := op
*/
#define pmfvec_set \
    ZNP_pmfvec_set
void
pmfvec_set (pmfvec_t res, const pmfvec_t op);


/*
   Multiplies first n coefficients of op by x.
*/
#define pmfvec_scalar_mul \
    ZNP_pmfvec_scalar_mul
void
pmfvec_scalar_mul (pmfvec_t op, ulong n, ulong x);


/*
   Multiplies pointwise the first n coefficients of op1 and op2, puts result
   in res.

   It's okay for res to alias op1 or op2. The modulus must be odd.

   If the special_first_two flag is set, the routine assumes that the first
   two coefficients are of length only M/2 (this is the typical situation
   after performing the FFT), and multiplies them more quickly accordingly.

   The routine automatically selects KS or Nussbaumer multiplication
   depending on the modulus bitsize and on M.

   The output will be divided by a fudge factor, which can be retrieved
   via pmfvec_mul_fudge().

   Automatically uses specialised squaring algorithm if the inputs are the
   same pmfvec_t object.
*/
#define pmfvec_mul \
    ZNP_pmfvec_mul
void
pmfvec_mul (pmfvec_t res, const pmfvec_t op1, const pmfvec_t op2, ulong n,
            int special_first_two);

#define pmfvec_mul_fudge \
    ZNP_pmfvec_mul_fudge
ulong
pmfvec_mul_fudge (unsigned lgM, int sqr, const zn_mod_t mod);


/*
   Modifies op->data and op->skip to make it look as if the first
   coefficient is the one at index n - 1, and the last coefficient is
   the one at index 0. Calling this function again undoes the reversal.
   Note that this function *must* be called a second time before calling
   pmfvec_clear(), so that free() is not called on the wrong pointer!
*/
#define pmfvec_reverse \
    ZNP_pmfvec_reverse
void
pmfvec_reverse (pmfvec_t op, ulong n);
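
/*
   Usage pattern sketch (hypothetical fragment):

      pmfvec_reverse (vec, n);   // view the first n coefficients reversed
      // ... operate on vec ...
      pmfvec_reverse (vec, n);   // undo *before* pmfvec_clear()
*/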



/* ============================================================================

     stuff in pmfvec_fft.c

============================================================================ */

/*
   ---------- forward FFTs ----------

   The functions

      pmfvec_fft()
      pmfvec_fft_basecase()
      pmfvec_fft_dc()
      pmfvec_fft_huge()

   operate on a pmfvec_t, and compute inplace the FFT:

      b_k = Y^{t k'} \sum_{i=0}^{K-1} Y^{2 M i k' / K} a_i,

   where 0 <= t < 2M / K is a twist parameter. The notation k' indicates the
   length-lgK bit-reversal of k.

   All except the "basecase" version have an n parameter; they only compute the
   first n outputs. The remaining buffers are used in intermediate steps, and
   contain junk at the end. For "basecase", all K outputs are computed.

   All except the "basecase" version have a z parameter; they assume that the
   input coefficients are zero from index z and beyond. They never read from
   those coefficients. For "basecase", all the inputs are used.

   The four versions use different algorithms as follows:

      * pmfvec_fft(): main entry point, delegates to one of the other routines
        based on the size of the transform.

      * pmfvec_fft_basecase(): low-overhead iterative FFT, no truncation logic.

      * pmfvec_fft_dc(): divide-and-conquer. It handles the top layer of
        butterflies, and then recurses into the two halves. This is intended
        for fairly small transforms where locality is not a big issue. The
        algorithm implemented here is essentially van der Hoeven's "truncated
        Fourier transform" [vdH04], [vdH05].

      * pmfvec_fft_huge(): intended for large transforms, where locality
        is an issue. It factors the FFT into U = 2^lgU transforms of length
        T = 2^lgT followed by T transforms of length U, where K = T * U. This
        is done recursively until we're in L1 cache (or as small as possible),
        at which point we switch to pmfvec_fft_dc().

        The algorithm is straightforward, but I believe it to be new. It is
        simultaneously a generalisation of van der Hoeven's truncated Fourier
        transform and Bailey's FFT algorithm [Bai89]. (I used something
        similar in the ZmodF_poly module in FLINT.)


   ---------- inverse FFTs ----------

   The functions

      pmfvec_ifft()
      pmfvec_ifft_basecase()
      pmfvec_ifft_dc()
      pmfvec_ifft_huge()

   compute the corresponding inverse FFT. They are a little more complicated
   than the FFTs owing to the truncation.

   Let a_i and b_k be as described above for the FFTs. The IFFT functions
   take as input the array

      b_{0'}, b_{1'}, ..., b_{(n-1)'}, K*a_n, K*a_{n+1}, ..., K*a_{K-1},

   for some 0 <= n <= K. If the fwd flag is zero, then the output of the IFFT
   routine is:

      K*a_0, K*a_1, ..., K*a_{n-1}

   followed by K - n junk coefficients. If the fwd flag is set, then we
   require that 0 <= n <= K - 1, and the output is

      K*a_0, K*a_1, ..., K*a_{n-1}, b_n

   followed by K - n - 1 junk coefficients; i.e. it also computes one
   coefficient of the *forward* FFT.

   The "basecase" version has no n parameter, and assumes that n = K. Here
   the routine becomes the (non-truncated) IFFT as usually understood, with
   inputs in bit-reversed order and outputs in normal order.

   All except the "basecase" version have a z parameter (with z >= n); they
   assume that the input coefficients are zero from index z and beyond. They
   never read from those coefficients. For the "basecase" version, all of the
   inputs are used.

   The four versions use different algorithms as follows:

      * pmfvec_ifft(): main entry point, delegates to one of the other routines
        based on the size of the transform.

      * pmfvec_ifft_basecase(): low-overhead iterative IFFT, no truncation
        logic.

      * pmfvec_ifft_dc(): divide-and-conquer. It recurses into the two halves,
        and handles the top layer of butterflies. This is intended for fairly
        small transforms where locality is not a big issue. The algorithm
        implemented here is essentially van der Hoeven's beautiful "truncated
        inverse Fourier transform".

      * pmfvec_ifft_huge(): intended for large transforms, where locality
        is an issue. It factors the IFFT into U = 2^lgU transforms of length
        T = 2^lgT and T transforms of length U, where K = T * U. This is done
        recursively until we're in L1 cache (if possible), at which point we
        switch to pmfvec_ifft_dc().

        The algorithm is not as simple as the FFT version; it is necessary
        to alternate between "row" and "column" transforms in a slightly
        complicated way. I believe the algorithm to be new.


   ---------- transposed forward and inverse FFTs ----------

   The functions

      pmfvec_tpfft_basecase()
      pmfvec_tpfft_dc()
      pmfvec_tpfft_huge()
      pmfvec_tpfft()

   are *transposed* versions of the corresponding pmfvec_fft() routines. This
   means that if the FFT computes an S-linear map from S^z to S^n, the
   transposed version computes the transpose map from S^n to S^z.

   Similarly, the functions

      pmfvec_tpifft_basecase()
      pmfvec_tpifft_dc()
      pmfvec_tpifft_huge()
      pmfvec_tpifft()

   are transposed versions of the IFFT routines. If the IFFT computes an
   S-linear map from S^z to S^(n + fwd), the transposed version computes the
   transpose map from S^(n + fwd) to S^z.

   The algorithms are transposed essentially by reversing them, and
   transposing every step of the algorithm; see for example [BLS03] for how
   this is done. We don't have comments in these routines; see the comments
   on the corresponding FFT/IFFT routines.


   ---------- notes for all the above functions ----------

   These functions all perform O(n * lgK + K) operations in S (an "operation"
   being an addition/subtraction/copy with possibly an implied rotation by a
   power of Y). In particular the running time varies fairly smoothly with
   n instead of jumping with K.

   For all four algorithms, the "dc" version is essentially equivalent to the
   "huge" version with lgT = 1.

   These functions are not thread-safe. Apart from modifying the input inplace,
   they also temporarily modify the pmfvec_t structs themselves.

   Our approach to improving cache performance is certainly not ideal. The
   main problem is that we can get address conflicts, especially since
   everything gets spread out by powers of two. Mitigating factors:
   associativity in the cache; the extra bias word scrambles the addresses
   somewhat; when the transforms get large, so do the coefficients, so we
   don't expect to fit that many in cache anyway.

*/


#define pmfvec_fft \
    ZNP_pmfvec_fft
void
pmfvec_fft (pmfvec_t op, ulong n, ulong z, ulong t);

#define pmfvec_fft_huge \
    ZNP_pmfvec_fft_huge
void
pmfvec_fft_huge (pmfvec_t op, unsigned lgT, ulong n, ulong z, ulong t);

#define pmfvec_fft_dc \
    ZNP_pmfvec_fft_dc
void
pmfvec_fft_dc (pmfvec_t op, ulong n, ulong z, ulong t);

#define pmfvec_fft_basecase \
    ZNP_pmfvec_fft_basecase
void
pmfvec_fft_basecase (pmfvec_t op, ulong t);

#define pmfvec_ifft \
    ZNP_pmfvec_ifft
void
pmfvec_ifft (pmfvec_t op, ulong n, int fwd, ulong z, ulong t);

#define pmfvec_ifft_huge \
    ZNP_pmfvec_ifft_huge
void
pmfvec_ifft_huge (pmfvec_t op, unsigned lgT, ulong n, int fwd, ulong z,
                  ulong t);

#define pmfvec_ifft_dc \
    ZNP_pmfvec_ifft_dc
void
pmfvec_ifft_dc (pmfvec_t op, ulong n, int fwd, ulong z, ulong t);

#define pmfvec_ifft_basecase \
    ZNP_pmfvec_ifft_basecase
void
pmfvec_ifft_basecase (pmfvec_t op, ulong t);

#define pmfvec_tpfft \
    ZNP_pmfvec_tpfft
void
pmfvec_tpfft (pmfvec_t op, ulong n, ulong z, ulong t);

#define pmfvec_tpfft_huge \
    ZNP_pmfvec_tpfft_huge
void
pmfvec_tpfft_huge (pmfvec_t op, unsigned lgT, ulong n, ulong z, ulong t);

#define pmfvec_tpfft_dc \
    ZNP_pmfvec_tpfft_dc
void
pmfvec_tpfft_dc (pmfvec_t op, ulong n, ulong z, ulong t);

#define pmfvec_tpfft_basecase \
    ZNP_pmfvec_tpfft_basecase
void
pmfvec_tpfft_basecase (pmfvec_t op, ulong t);

#define pmfvec_tpifft \
    ZNP_pmfvec_tpifft
void
pmfvec_tpifft (pmfvec_t op, ulong n, int fwd, ulong z, ulong t);

#define pmfvec_tpifft_huge \
    ZNP_pmfvec_tpifft_huge
void
pmfvec_tpifft_huge (pmfvec_t op, unsigned lgT, ulong n, int fwd, ulong z,
                    ulong t);

#define pmfvec_tpifft_dc \
    ZNP_pmfvec_tpifft_dc
void
pmfvec_tpifft_dc (pmfvec_t op, ulong n, int fwd, ulong z, ulong t);

#define pmfvec_tpifft_basecase \
    ZNP_pmfvec_tpifft_basecase
void
pmfvec_tpifft_basecase (pmfvec_t op, ulong t);
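
/*
   Round-trip sketch tying the above together (hypothetical fragment;
   the untruncated case n = z = K with twist t = 0, so the IFFT returns
   the original coefficients scaled by K):

      pmfvec_fft (vec, vec->K, vec->K, 0);       // a_i -> b_{k'}
      // ... e.g. pointwise multiplication via pmfvec_mul ...
      pmfvec_ifft (vec, vec->K, 0, vec->K, 0);   // recovers K * a_i
*/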



/* ============================================================================

     stuff in array.c

============================================================================ */


/*
   Computes

      res = sign1*op1 + sign2*op2,

   where sign1 = -1 if neg1 is set, otherwise +1; ditto for sign2.

   op1 and op2 are arrays of length n.

   res is a staggered array, entries separated by skip.

   Return value is res + skip * n, i.e. points just beyond the written array.
*/
#define zn_skip_array_signed_add \
    ZNP_zn_skip_array_signed_add
ulong*
zn_skip_array_signed_add (ulong* res, ptrdiff_t skip, size_t n,
                          const ulong* op1, int neg1,
                          const ulong* op2, int neg2,
                          const zn_mod_t mod);


/*
   Same as zn_array_scalar_mul, but has a _redc_ flag. If the flag is set,
   then REDC reduction is used (in which case the modulus must be odd),
   otherwise ordinary reduction is used.
*/
#define _zn_array_scalar_mul \
    ZNP__zn_array_scalar_mul
void
_zn_array_scalar_mul (ulong* res, const ulong* op, size_t n, ulong x,
                      int redc, const zn_mod_t mod);


/*
   Behaves just like zn_array_scalar_mul, except it uses the obvious
   optimisation if x == 1.
*/
#define zn_array_scalar_mul_or_copy \
    ZNP_zn_array_scalar_mul_or_copy
void
zn_array_scalar_mul_or_copy (ulong* res, const ulong* op, size_t n,
                             ulong x, const zn_mod_t mod);


/* ============================================================================

     stuff in nuss.c

============================================================================ */


/*
   Performs negacyclic multiplication using Nussbaumer's algorithm.

   vec1 and vec2 must be pre-initialised pmfvec_t's with the same
   modulus and the same lgM and lgK, satisfying lgM + 1 >= lgK (i.e. there
   are enough roots of unity). These are used for scratch space.

   The convolution length is L = 2^lgL, where lgL = lgM + lgK - 1.

   Inputs are op1[0, L) and op2[0, L), output is res[0, L). It's okay for res
   to alias op1 or op2.

   If op1 == op2, then a faster squaring version is used. In this case
   vec2 is ignored.

   The result comes out divided by a fudge factor, which can be recovered
   via nuss_mul_fudge().
*/
#define nuss_mul \
    ZNP_nuss_mul
void
nuss_mul (ulong* res, const ulong* op1, const ulong* op2,
          pmfvec_t vec1, pmfvec_t vec2);

#define nuss_mul_fudge \
    ZNP_nuss_mul_fudge
ulong
nuss_mul_fudge (unsigned lgL, int sqr, const zn_mod_t mod);


/*
   Computes optimal lgK and lgM for given lgL, as described above for
   nuss_mul().
*/
#define nuss_params \
    ZNP_nuss_params
void
nuss_params (unsigned* lgK, unsigned* lgM, unsigned lgL);
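
/*
   Setup sketch for a negacyclic product of length L = 2^lgL (hypothetical
   fragment; mod is an odd modulus, and res/op1/op2 have length L):

      unsigned lgK, lgM;
      nuss_params (&lgK, &lgM, lgL);   // chooses lgK + lgM - 1 == lgL
      pmfvec_t vec1, vec2;
      pmfvec_init_nuss (vec1, lgL, mod);
      pmfvec_init_nuss (vec2, lgL, mod);
      nuss_mul (res, op1, op2, vec1, vec2);
      // res is now the negacyclic product divided by
      // nuss_mul_fudge (lgL, 0, mod)
      pmfvec_clear (vec2);
      pmfvec_clear (vec1);
*/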



/* ============================================================================

     stuff from mul_fft.c

============================================================================ */


/*
   Splits op[-k, n) into pieces of length M/2, where M = res->M, and where
   the first k coefficients are assumed to be zero. The pieces are written to
   the first ceil((n + k) / (M/2)) coefficients of res. The last fractional
   piece is treated as if zero-padded up to length M/2. The second half of
   each target pmf_t is zeroed out, and the bias fields are all set to b.

   If x != 1, then all entries are multiplied by x mod m.
*/
#define fft_split \
    ZNP_fft_split
void
fft_split (pmfvec_t res, const ulong* op, size_t n, size_t k, ulong x,
           ulong b);


/*
   Performs the substitution back from S[Z]/(Z^K - 1) to a polynomial in X,
   i.e. mapping Y -> X, Z -> X^(M/2). It only looks at the first z coefficients
   of op; it assumes the rest are zero. It writes exactly n coefficients of
   output. If skip_first is set, it ignores the first M/2 coefficients of
   output, and then writes the *next* n coefficients of output.

   NOTE: this routine is not thread-safe: it temporarily modifies the bias
   field of the first coefficient of op.
*/
#define fft_combine \
    ZNP_fft_combine
void
fft_combine (ulong* res, size_t n, const pmfvec_t op, ulong z,
             int skip_first);



/*
   Same as zn_array_mul(), but uses the Schonhage/Nussbaumer FFT algorithm.

   Uses faster algorithm for squaring if inputs are identical buffers.

   The modulus must be odd.

   Output may overlap the inputs.

   The output will come out divided by a fudge factor, which can be recovered
   via zn_array_mul_fft_fudge().

   If x != 1, the output is further multiplied by x.
*/
#define zn_array_mul_fft \
    ZNP_zn_array_mul_fft
void
zn_array_mul_fft (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  ulong x, const zn_mod_t mod);

#define zn_array_mul_fft_fudge \
    ZNP_zn_array_mul_fft_fudge
ulong
zn_array_mul_fft_fudge (size_t n1, size_t n2, int sqr, const zn_mod_t mod);


/*
   Computes the best lgK, lgM, m1, m2 such that polynomials of length n1 and
   n2 may be multiplied with fourier transform parameters lgK and lgM, and
   where the polynomials get split into m1 (resp. m2) chunks of length M/2.

   More precisely, the outputs satisfy:

   * lgM + 1 >= lgK (i.e. there are enough roots of unity)
   * m1 + m2 - 1 <= K, where m1 = ceil(n1 / (M/2)) and m2 = ceil(n2 / (M/2))
     (i.e. the transform has enough room for the answer)
   * lgM >= 1
   * lgM is minimal subject to the above conditions.
*/
#define mul_fft_params \
    ZNP_mul_fft_params
void
mul_fft_params (unsigned* lgK, unsigned* lgM, ulong* m1, ulong* m2,
                size_t n1, size_t n2);


/*
   Same as zn_array_mulmid(), but uses the Schonhage/Nussbaumer FFT algorithm.

   The modulus must be odd.

   Output may overlap the inputs.

   The output will come out divided by a fudge factor, which can be recovered
   via zn_array_mulmid_fft_fudge().

   If x != 1, the output is further multiplied by x.
*/
#define zn_array_mulmid_fft \
    ZNP_zn_array_mulmid_fft
void
zn_array_mulmid_fft (ulong* res,
                     const ulong* op1, size_t n1,
                     const ulong* op2, size_t n2,
                     ulong x, const zn_mod_t mod);

#define zn_array_mulmid_fft_fudge \
    ZNP_zn_array_mulmid_fft_fudge
ulong
zn_array_mulmid_fft_fudge (size_t n1, size_t n2, const zn_mod_t mod);


/*
   Computes the best lgK, lgM, m1, m2, p such that the middle product of
   polynomials of length n1 and n2 may be computed with fourier transform
   parameters lgK and lgM, where the first polynomial is padded on the left by
   p zeroes, and where the polynomials get split into m1 (resp. m2) chunks of
   length M/2.

   More precisely, the outputs satisfy (see mul_fft.c for further discussion):

   * lgM >= 1
   * lgM + 1 >= lgK (i.e. there are enough roots of unity)
   * 1 <= p <= M/2  and  n2 + p - 1 is divisible by M/2
   * m1 = ceil((n1 + p) / (M/2))
   * m2 = ceil(n2 / (M/2))
   * m1 <= K
   * lgM is minimal subject to the above conditions.
*/
#define mulmid_fft_params \
    ZNP_mulmid_fft_params
void
mulmid_fft_params (unsigned* lgK, unsigned* lgM, ulong* m1, ulong* m2,
                   ulong* p, size_t n1, size_t n2);


/*
   Stores precomputed information for performing an FFT-based middle product
   where the first input array is invariant, and the length of the second
   input array is invariant.
*/
#define zn_array_mulmid_fft_precomp1_struct \
    ZNP_zn_array_mulmid_fft_precomp1_struct
struct zn_array_mulmid_fft_precomp1_struct
{
   // these parameters are as described at the top of mul_fft.c.
   size_t n1, n2;
   ulong m1, m2, p;

   // holds the transposed IFFT of the input array
   pmfvec_t vec1;
};

#define zn_array_mulmid_fft_precomp1_t \
    ZNP_zn_array_mulmid_fft_precomp1_t
typedef struct zn_array_mulmid_fft_precomp1_struct
               zn_array_mulmid_fft_precomp1_t[1];


/*
   Initialises res to perform middle product of op1[0, n1) by operands of
   size n2.

   If x != 1, the data is multiplied by x. Since middle products are linear,
   this has the effect of multiplying the output of subsequent calls to
   zn_array_mulmid_fft_precomp1_execute() by x.
*/
#define zn_array_mulmid_fft_precomp1_init \
    ZNP_zn_array_mulmid_fft_precomp1_init
void
zn_array_mulmid_fft_precomp1_init (zn_array_mulmid_fft_precomp1_t res,
                                   const ulong* op1, size_t n1, size_t n2,
                                   ulong x, const zn_mod_t mod);

/*
   Performs middle product of op1[0, n1) by op2[0, n2), stores result at
   res[0, n1 - n2 + 1).

   The output will come out divided by a fudge factor, which can be recovered
   via zn_array_mulmid_fft_precomp1_fudge().

   If x != 1, the output is further multiplied by x.
*/
#define zn_array_mulmid_fft_precomp1_execute \
    ZNP_zn_array_mulmid_fft_precomp1_execute
void
zn_array_mulmid_fft_precomp1_execute
                     (ulong* res, const ulong* op2, ulong x,
                      const zn_array_mulmid_fft_precomp1_t precomp);

#define zn_array_mulmid_fft_precomp1_fudge \
    ZNP_zn_array_mulmid_fft_precomp1_fudge
ulong
zn_array_mulmid_fft_precomp1_fudge (size_t n1, size_t n2, const zn_mod_t mod);


/*
   Deallocates op.
*/
#define zn_array_mulmid_fft_precomp1_clear \
    ZNP_zn_array_mulmid_fft_precomp1_clear
void
zn_array_mulmid_fft_precomp1_clear (zn_array_mulmid_fft_precomp1_t op);
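
/*
   Lifecycle sketch for the precomputed middle product (hypothetical
   fragment; op1[0, n1) is fixed, and many arrays op2[0, n2) arrive
   one at a time):

      zn_array_mulmid_fft_precomp1_t precomp;
      zn_array_mulmid_fft_precomp1_init (precomp, op1, n1, n2, 1, mod);
      // for each incoming op2:
      zn_array_mulmid_fft_precomp1_execute (res, op2, 1, precomp);
      // res is divided by zn_array_mulmid_fft_precomp1_fudge (n1, n2, mod)
      zn_array_mulmid_fft_precomp1_clear (precomp);
*/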



/* ============================================================================

     stuff from mpn_mulmid.c

============================================================================ */


/*
   Let n1 >= n2 >= 1, and let

      a = \sum_{i=0}^{n1-1} a_i B^i
      b = \sum_{j=0}^{n2-1} b_j B^j

   be integers with n1 and n2 limbs respectively. We define SMP(a, b), the
   *simple* middle product of a and b, to be the integer

      \sum_{0 <= i < n1, 0 <= j < n2, n2-1 <= i+j < n1} a_i b_j B^(i+j-(n2-1)).

   In other words, it's as if we treat a and b as polynomials in Z[B] of
   length n1 and n2 respectively, compute the polynomial middle product over
   Z, and then propagate the high words and subsequent carries.

   Note that SMP(a, b) is at most n1 - n2 + 3 limbs long (we assume throughout
   that n1 is less than the maximum value stored in a limb).
*/
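
/*
   For reference, a minimal quadratic SMP sketch following the definition
   above (illustrative only; ZNP_mpn_smp_basecase below is the tuned
   implementation). For fixed j, the constraint n2-1 <= i+j < n1 selects
   the window op1[n2-1-j, n1-j), always n1 - n2 + 1 limbs wide:

      void smp_naive (mp_limb_t* res, const mp_limb_t* op1, size_t n1,
                      const mp_limb_t* op2, size_t n2)
      {
         size_t len = n1 - n2 + 1, j;
         mpn_zero (res, len + 2);   // output is n1 - n2 + 3 limbs
         for (j = 0; j < n2; j++)
         {
            // row j, shifted to offset 0; carries spill into top 2 limbs
            mp_limb_t hi = mpn_addmul_1 (res, op1 + n2 - 1 - j, len, op2[j]);
            mpn_add_1 (res + len, res + len, 2, hi);
         }
      }
*/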


/*
   Computes SMP(op1[0, n1), op2[0, n2)).

   Stores result at res[0, n1 - n2 + 3).
*/
void
ZNP_mpn_smp (mp_limb_t* res,
             const mp_limb_t* op1, size_t n1,
             const mp_limb_t* op2, size_t n2);


/*
   Same as mpn_smp(), but always uses basecase (quadratic-time) algorithm.

   res[0, n1 - n2 + 3) must not overlap op1[0, n1) or op2[0, n2).
*/
void
ZNP_mpn_smp_basecase (mp_limb_t* res,
                      const mp_limb_t* op1, size_t n1,
                      const mp_limb_t* op2, size_t n2);


/*
   Computes SMP(op1[0, 2*n - 1), op2[0, n)). Algorithm is selected depending
   on size of n.

   Stores result at res[0, n + 2). Output must not overlap inputs.
*/
void
ZNP_mpn_smp_n (mp_limb_t* res, const mp_limb_t* op1,
               const mp_limb_t* op2, size_t n);


/*
   Computes SMP(op1[0, 2*n - 1), op2[0, n)), using Karatsuba algorithm.

   Must have n >= 2.

   Stores result at res[0, n + 2). Output must not overlap inputs.
*/
void
ZNP_mpn_smp_kara (mp_limb_t* res, const mp_limb_t* op1, const mp_limb_t* op2,
                  size_t n);



/*
   Computes the *true* middle product of op1[0, n1) and op2[0, n2).

   More precisely, let P be the product op1 * op2. This function computes
   P[n2 + 1, n1), and stores this at res[2, n1 - n2 + 1).

   Must have n1 >= n2 >= 1.

   The output buffer res *must* have room for n1 - n2 + 3 limbs, but the first
   two limbs and the last two limbs of the output will be *garbage*. In
   particular, only n1 - n2 - 1 limbs of useful output are produced. If
   n1 <= n2 + 1, then no useful output is produced.
*/
void
ZNP_mpn_mulmid (mp_limb_t* res,
                const mp_limb_t* op1, size_t n1,
                const mp_limb_t* op2, size_t n2);


/*
   Same as mpn_mulmid, but always just falls back on using mpn_mul.
*/
void
ZNP_mpn_mulmid_fallback (mp_limb_t* res,
                         const mp_limb_t* op1, size_t n1,
                         const mp_limb_t* op2, size_t n2);




/* ============================================================================

     stuff from mulmid.c

============================================================================ */


/*
   Same as zn_array_mulmid(), but always falls back on doing the full product
   via _zn_array_mul().

   If fastred is cleared, the output is the same as for zn_array_mulmid().

   If fastred is set, the routine uses the fastest modular reduction strategy
   available for the given parameters. The result will come out divided by a
   fudge factor, which can be recovered via
   zn_array_mulmid_fallback_fudge().
*/
#define zn_array_mulmid_fallback \
    ZNP_zn_array_mulmid_fallback
void
zn_array_mulmid_fallback (ulong* res,
                          const ulong* op1, size_t n1,
                          const ulong* op2, size_t n2,
                          int fastred, const zn_mod_t mod);

#define zn_array_mulmid_fallback_fudge \
    ZNP_zn_array_mulmid_fallback_fudge
ulong
zn_array_mulmid_fallback_fudge (size_t n1, size_t n2, const zn_mod_t mod);


/*
   Identical to zn_array_mulmid(), except for the fastred flag.

   If fastred is cleared, the output is the same as for zn_array_mulmid().

   If fastred is set, the routine uses the fastest modular reduction strategy
   available for the given parameters. The result will come out divided by a
   fudge factor, which can be recovered via _zn_array_mulmid_fudge().
*/
#define _zn_array_mulmid \
    ZNP__zn_array_mulmid
void
_zn_array_mulmid (ulong* res,
                  const ulong* op1, size_t n1,
                  const ulong* op2, size_t n2,
                  int fastred, const zn_mod_t mod);

#define _zn_array_mulmid_fudge \
    ZNP__zn_array_mulmid_fudge
ulong
_zn_array_mulmid_fudge (size_t n1, size_t n2, const zn_mod_t mod);


/* ============================================================================

     other random stuff

============================================================================ */


/*
   Compute 2^k mod m.

   Modulus must be odd.

   Must have -ULONG_BITS < k < ULONG_BITS.
*/
#define zn_mod_pow2 \
    ZNP_zn_mod_pow2
ulong
zn_mod_pow2 (int k, const zn_mod_t mod);



/*
   Constants describing algorithms for precomputed middle products
*/
#define ZNP_MULMID_ALGO_FALLBACK  0
#define ZNP_MULMID_ALGO_KS        1
#define ZNP_MULMID_ALGO_FFT       2



#ifdef __cplusplus
}
#endif

#endif

// end of file ****************************************************************