1 /*
2 zn_poly_internal.h: main header file #included internally by zn_poly
3 modules
4
5 Copyright (C) 2007, 2008, David Harvey
6
7 This file is part of the zn_poly library (version 0.9).
8
9 This program is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 2 of the License, or
12 (at your option) version 3 of the License.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
21
22 */
23
24
25 /*
26
27 IMPORTANT NOTE!!!!!!
28
29 Everything in this file is internal, and may change incompatibly in
30 future versions of zn_poly. You have been warned!
31
32 */
33
34
35 #ifndef ZN_POLY_INTERNAL_H
36 #define ZN_POLY_INTERNAL_H
37
38 #include <gmp.h>
39 #include <stddef.h>
40 #include <stdio.h>
41 #include "zn_poly.h"
42
43
44 #ifdef __cplusplus
45 extern "C" {
46 #endif
47
48
49
50 #ifdef ZNP_USE_FLINT
51
52 // use FLINT integer multiplication
53 #include <FLINT/mpn_extras.h>
54 #define ZNP_mpn_mul F_mpn_mul
55
56 #else
57
58 // use GMP integer multiplication
59 #define ZNP_mpn_mul mpn_mul
60
61 #endif
62
63
64
65 /*
66 Executes "stuff", and checks that it returns 0. Usage is e.g.:
67
68 ZNP_ASSERT_NOCARRY (mpn_add_n (z, x, y, 42));
69
70 This is used where we want the test suite to check that the return value
71 is zero, but the production version should skip the check, and we don't
72 want to stuff around with temporaries everywhere.
73 */
// (temporary renamed: identifiers beginning with a double underscore are
// reserved for the implementation, C11 7.1.3)
#define ZNP_ASSERT_NOCARRY(stuff)       \
    do {                                \
       mp_limb_t znp_xyz_cy;            \
       znp_xyz_cy = (stuff);            \
       ZNP_ASSERT (znp_xyz_cy == 0);    \
    } while (0)
80
81
/*
   For integers a >= 1 and b >= 1, returns ceil(a / b).
   Each argument is evaluated exactly once.
*/
#define CEIL_DIV(a, b) ((((a) - 1) / (b)) + 1)


/*
   For integers a >= 1 and 0 <= r < ULONG_BITS, returns ceil(a / 2^r).
   Each argument is evaluated exactly once.
*/
#define CEIL_DIV_2EXP(a, r) ((((a) - 1) >> (r)) + 1)


// NOTE: like any naive min/max macro, both arguments are evaluated more
// than once; do not pass expressions with side effects.
#define ZNP_MIN(aaa, bbb) (((aaa) < (bbb)) ? (aaa) : (bbb))
#define ZNP_MAX(aaa, bbb) (((aaa) > (bbb)) ? (aaa) : (bbb))
96
97
98 /*
99 Estimate of the L1 cache size, in bytes.
100 If this is a bit on the small side, it's probably not a big deal.
101 If it's on the big side, that might start to seriously degrade performance.
102 */
103 #define ZNP_CACHE_SIZE 32768
104
105
106 /*
107 Returns ceil(log2(x)).
108 x must be >= 1.
109 */
110 #define ceil_lg \
111 ZNP_ceil_lg
112 int ceil_lg (ulong x);
113
114
115 /*
116 Returns floor(log2(x)).
117 Returns -1 for x == 0.
118 */
119 #define floor_lg \
120 ZNP_floor_lg
121 int
122 floor_lg (ulong x);
123
124
125 /*
126 res := abs(op1 - op2).
127
128 Returns 1 if op1 - op2 is negative, else zero.
129 */
130 #define signed_mpn_sub_n \
131 ZNP_signed_mpn_sub_n
132 ZNP_INLINE int
signed_mpn_sub_n(mp_limb_t * res,const mp_limb_t * op1,const mp_limb_t * op2,size_t n)133 signed_mpn_sub_n (mp_limb_t* res, const mp_limb_t* op1, const mp_limb_t* op2,
134 size_t n)
135 {
136 if (mpn_cmp (op1, op2, n) >= 0)
137 {
138 mpn_sub_n (res, op1, op2, n);
139 return 0;
140 }
141 else
142 {
143 mpn_sub_n (res, op2, op1, n);
144 return 1;
145 }
146 }
147
148
149 /*
150 The ZNP_FASTALLOC and ZNP_FASTFREE macros are used for allocating memory
151 which is taken off the stack if the request is small enough, or off the
152 heap if not.
153
154 Example usage:
155
156 ZNP_FASTALLOC (stuff, int, 100, n);
157
158 This does two things. It allocates an array of 100 ints on the stack.
159 It also declares a pointer "int* stuff", which points to a block of ints
160 of length n. If n <= 100, the block will be the one just allocated on the
161 stack. If n > 100, the block will be found using malloc.
162
163 Then afterwards, you need to do:
164
165 ZNP_FASTFREE (stuff);
166
167 This will call free() if the block was originally taken off the heap.
168 */
169
// NOTE: ZNP_FASTALLOC cannot be wrapped in do { ... } while (0), since it
// must declare "ptr" and its backing storage in the caller's scope. The
// internal identifiers avoid the reserved "__" prefix (C11 7.1.3).
#define ZNP_FASTALLOC(ptr, type, reserve, request)                      \
   size_t ZNP_FASTALLOC_req_##ptr = (request);                          \
   type* ptr;                                                           \
   type ZNP_FASTALLOC_stack_##ptr [reserve];                            \
   if (ZNP_FASTALLOC_req_##ptr <= (reserve))                            \
      ptr = ZNP_FASTALLOC_stack_##ptr;                                  \
   else                                                                 \
      ptr = (type*) malloc (sizeof (type) * ZNP_FASTALLOC_req_##ptr);


// Frees the block only if it was taken off the heap. Wrapped in
// do/while (0) so that "if (cond) ZNP_FASTFREE (ptr); else ..." parses
// as intended (the bare if previously created a dangling-else hazard).
#define ZNP_FASTFREE(ptr)                                               \
   do {                                                                 \
      if (ptr != ZNP_FASTALLOC_stack_##ptr)                             \
         free (ptr);                                                    \
   } while (0)
183
184
185
186 extern size_t
187 ZNP_mpn_smp_kara_thresh;
188
189 extern size_t
190 ZNP_mpn_mulmid_fallback_thresh;
191
192
193 /*
194 Stores tuning data for moduli of a specific bitsize.
195 */
#define tuning_info_t \
    ZNP_tuning_info_t
typedef struct
{
   // thresholds for array multiplication; each threshold is the size at
   // which the next algorithm takes over (presumably the polynomial
   // length — confirm against the dispatch logic in mul.c)
   size_t mul_KS2_thresh;     // KS1 -> KS2 threshold
   size_t mul_KS4_thresh;     // KS2 -> KS4 threshold
   size_t mul_fft_thresh;     // KS4 -> fft threshold

   // as above, but for squaring
   size_t sqr_KS2_thresh;
   size_t sqr_KS4_thresh;
   size_t sqr_fft_thresh;

   // as above, but for middle products
   size_t mulmid_KS2_thresh;
   size_t mulmid_KS4_thresh;
   size_t mulmid_fft_thresh;

   // for negacyclic multiplications, switch from KS to Nussbaumer FFT
   // when length reaches 2^nuss_mul_thresh
   unsigned nuss_mul_thresh;
   // ditto for nussbaumer squaring
   unsigned nuss_sqr_thresh;

}
tuning_info_t;
223
224
225 /*
226 Global array of tuning_info_t's, one for each bitsize.
227 */
228 #define tuning_info \
229 ZNP_tuning_info
230 extern tuning_info_t tuning_info[];
231
232
233
234 /* ============================================================================
235
236 stuff from pack.c
237
238 ============================================================================ */
239
240 /*
241 Computes
242
243 res := 2^k * ( op[0] + op[s]*2^b + ... + op[(n-1)*s]*2^((n-1)*b) ).
244
245 Assumes each op[i] satisfies 0 <= op[i] < 2^b.
246
247 Must have 0 < b < 3 * ULONG_BITS.
248
249 If r == 0, then exactly
250 ceil((k + n * b) / GMP_NUMB_BITS)
251 limbs are written. Otherwise, the output will be zero-padded up to exactly
252 r limbs, which must be at least the above number of limbs.
253
254 */
255 #define zn_array_pack \
256 ZNP_zn_array_pack
257 void
258 zn_array_pack (mp_limb_t* res, const ulong* op, size_t n, ptrdiff_t s,
259 unsigned b, unsigned k, size_t r);
260
261
262 /*
263 Let op be an integer of the form
264
265 2^k * (a[0] + a[1]*2^b + ... + a[n-1]*2^((n-1)*b)) + junk,
266
267 where 0 <= a[i] < 2^b for each i, and where 0 <= junk < 2^k.
268
269 This function reads off the a[i]'s and stores them at res. Each output
270 coefficient occupies exactly ceil(b / ULONG_BITS) words. The input
271 should be exactly ceil((k + n * b) / GMP_NUMB_BITS) limbs long.
272
273 Must have 0 < b < 3 * ULONG_BITS.
274 */
275 #define zn_array_unpack \
276 ZNP_zn_array_unpack
277 void
278 zn_array_unpack (ulong* res, const mp_limb_t* op, size_t n, unsigned b,
279 unsigned k);
280
281
282 /*
283 Same as zn_array_unpack, but adds an assertion to check that the unpacking
284 routine will not read beyond the first r limbs of op.
285 */
// Debug-checked wrapper: asserts that reading n values of b bits each,
// offset by k bits, fits within the first r limbs of op, then delegates
// to zn_array_unpack.
#define zn_array_unpack_SAFE(res, op, n, b, k, r)            \
do                                                           \
{                                                            \
    ZNP_ASSERT((n) * (b) + (k) <= (r) * GMP_NUMB_BITS);      \
    zn_array_unpack(res, op, n, b, k);                       \
} while (0)
292
293
294
295 /* ============================================================================
296
297 stuff from mul.c
298
299 ============================================================================ */
300
301 /*
302 Identical to zn_array_mul(), except for the fastred flag.
303
304 If fastred is cleared, the output is the same as for zn_array_mul().
305
306 If fastred is set, the routine uses the fastest modular reduction strategy
307 available for the given parameters. The result will come out divided by a
308 fudge factor, which can be recovered via _zn_array_mul_fudge().
309 */
310 #define _zn_array_mul \
311 ZNP__zn_array_mul
312 void
313 _zn_array_mul (ulong* res,
314 const ulong* op1, size_t n1,
315 const ulong* op2, size_t n2,
316 int fastred, const zn_mod_t mod);
317
318 #define _zn_array_mul_fudge \
319 ZNP__zn_array_mul_fudge
320 ulong
321 _zn_array_mul_fudge (size_t n1, size_t n2, int sqr, const zn_mod_t mod);
322
323
324
325 /* ============================================================================
326
327 stuff from ks_support.c
328
329 ============================================================================ */
330
331
332 /*
333 Sets res[i * s] = reduction modulo mod of the i-th entry of op,
334 for 0 <= i < n. Each entry of op is w ulongs.
335
336 If the redc flag is set, the results are divided by -B mod m
337 (only allowed if the modulus is odd).
338
339 Must have 1 <= w <= 3.
340 */
341 #define array_reduce \
342 ZNP_array_reduce
343 void
344 array_reduce (ulong* res, ptrdiff_t s, const ulong* op, size_t n, unsigned w,
345 int redc, const zn_mod_t mod);
346
347
348
349 /*
350 This is a helper function for the variants of KS that evaluate at
351 "reciprocal" evaluation points like 2^(-N); it implements essentially the
352 algorithm of section 3.2 of [Har07], plus reductions mod n.
353
354 It accepts two integers X and Y written in base M = 2^b, where
355 1 <= b <= 3 * ULONG_BITS / 2. It assumes that
356
357 X = a[0] + a[1]*M + ... + a[n-1]*M^(n-1),
358 Y = a[n-1] + a[n-2]*M + ... + a[0]*M^(n-1),
359
360 where each a[i] is two "digits" long, and where the high digit of a[i]
361 is at most M-2 (i.e. may not equal M-1). It reconstructs the a[i],
362 reduces them mod m, and stores the results in an array.
363
364 The input is supplied as follows. X is in op1, Y is in op2. They are both
365 arrays of values that are b bits wide, where b <= 3 * ULONG_BITS / 2.
366 Each value takes up one ulong if b <= ULONG_BITS, otherwise two ulongs.
367 There are n + 1 such values in each array (i.e. each array consists of
368 (n + 1) * ceil(b / ULONG_BITS) ulongs).
369
370 The output (n ulongs) is written to the array res, with consecutive
371 outputs separated by s ulongs.
372
373 mod describes the modulus m.
374
375 If the redc flag is set, the modular reductions are performed using REDC,
376 i.e. the result contain an extra factor of -1/B mod m (where
377 B = 2^ULONG_BITS).
378 */
379
380 #define zn_array_recover_reduce \
381 ZNP_zn_array_recover_reduce
382 void
383 zn_array_recover_reduce (ulong* res, ptrdiff_t s, const ulong* op1,
384 const ulong* op2, size_t n, unsigned b, int redc,
385 const zn_mod_t mod);
386
387
388
389 /* ============================================================================
390
391 stuff from mul_ks.c
392
393 ============================================================================ */
394
395 /*
396 These are the same as zn_array_mul(). They use four different types of
397 Kronecker substitution. They automatically use a faster algorithm for
398 squaring (if the inputs are identical buffers).
399
400 Aliasing of all operands allowed.
401
402 Must have n1 >= n2 >= 1.
403
404 If the redc flag is set, the outputs will be divided by -B mod m.
405 (Only allowed if the modulus is odd.)
406 */
407 #define zn_array_mul_KS1 \
408 ZNP_zn_array_mul_KS1
409 void
410 zn_array_mul_KS1 (ulong* res,
411 const ulong* op1, size_t n1,
412 const ulong* op2, size_t n2,
413 int redc, const zn_mod_t mod);
414
415 #define zn_array_mul_KS2 \
416 ZNP_zn_array_mul_KS2
417 void
418 zn_array_mul_KS2 (ulong* res,
419 const ulong* op1, size_t n1,
420 const ulong* op2, size_t n2,
421 int redc, const zn_mod_t mod);
422
423 #define zn_array_mul_KS3 \
424 ZNP_zn_array_mul_KS3
425 void
426 zn_array_mul_KS3 (ulong* res,
427 const ulong* op1, size_t n1,
428 const ulong* op2, size_t n2,
429 int redc, const zn_mod_t mod);
430
431 #define zn_array_mul_KS4 \
432 ZNP_zn_array_mul_KS4
433 void
434 zn_array_mul_KS4 (ulong* res,
435 const ulong* op1, size_t n1,
436 const ulong* op2, size_t n2,
437 int redc, const zn_mod_t mod);
438
439
440 /* ============================================================================
441
442 stuff from mulmid_ks.c
443
444 ============================================================================ */
445
446
447 /*
448 These are the same as zn_array_mulmid(). They use four different types of
449 Kronecker substitution.
450
451 Aliasing of all operands allowed.
452
453 Must have n1 >= n2 >= 1.
454
455 If the redc flag is set, the outputs will be divided by -B mod m.
456 (Only allowed if the modulus is odd.)
457 */
458 #define zn_array_mulmid_KS1 \
459 ZNP_zn_array_mulmid_KS1
460 void
461 zn_array_mulmid_KS1 (ulong* res,
462 const ulong* op1, size_t n1,
463 const ulong* op2, size_t n2,
464 int redc, const zn_mod_t mod);
465
466 #define zn_array_mulmid_KS2 \
467 ZNP_zn_array_mulmid_KS2
468 void
469 zn_array_mulmid_KS2 (ulong* res,
470 const ulong* op1, size_t n1,
471 const ulong* op2, size_t n2,
472 int redc, const zn_mod_t mod);
473
474 #define zn_array_mulmid_KS3 \
475 ZNP_zn_array_mulmid_KS3
476 void
477 zn_array_mulmid_KS3 (ulong* res,
478 const ulong* op1, size_t n1,
479 const ulong* op2, size_t n2,
480 int redc, const zn_mod_t mod);
481
482 #define zn_array_mulmid_KS4 \
483 ZNP_zn_array_mulmid_KS4
484 void
485 zn_array_mulmid_KS4 (ulong* res,
486 const ulong* op1, size_t n1,
487 const ulong* op2, size_t n2,
488 int redc, const zn_mod_t mod);
489
490
491
492 /* ============================================================================
493
494 pmf_t stuff
495
496 ============================================================================ */
497
498
499 /*
500 Let R = Z/mZ. A pmf_t ("pmf" = "polynomial modulo fermat") represents an
501 element of S = R[Y]/(Y^M + 1). This is used as the coefficient ring in
502 the Schonhage/Nussbaumer FFT routines.
503
504 It's an array of ulongs of length M + 1, where M = 2^lgM is a power of two.
505 (The value M is not stored, the caller needs to keep track of it.)
506
507 The first value in the array is an integer b called the "bias" of the
508 representation.
509
510 The remaining M values are coefficients a_0, ..., a_{M-1}.
511
512 These together represent the polynomial
513
514 Y^b (a_0 + a_1 Y + ... + a_{M-1} Y^{M-1}).
515
516 Note that elements of S do not have a unique representation in this form;
517 in fact they have one possible representation for each value of b in
518 [0, 2M). (By allowing nonzero bias, we get more efficient in-place FFT
519 butterflies.) The stored bias value need not be in [0, 2M), but it is
520 interpreted mod 2M.
521
522 Currently the values a_i are always normalised into [0, m). Later we might
523 drop that restriction to obtain faster butterflies...
524
525 */
526 #define pmf_t \
527 ZNP_pmf_t
528 typedef ulong* pmf_t;
529
530 #define pmf_const_t \
531 ZNP_pmf_const_t
532 typedef const ulong* pmf_const_t;
533
534
535 /*
536 op := op * x
537 */
#define pmf_scalar_mul \
    ZNP_pmf_scalar_mul
ZNP_INLINE void
pmf_scalar_mul (pmf_t op, ulong M, ulong x, const zn_mod_t mod)
{
   // op + 1 skips the bias word at op[0]; only the M coefficients are
   // scaled, since multiplying by a scalar does not affect the power of Y.
   zn_array_scalar_mul (op + 1, op + 1, M, x, mod);
}
545
546
547 /*
548 op := 0, with bias reset to zero too
549 */
550 #define pmf_zero \
551 ZNP_pmf_zero
552 ZNP_INLINE void
pmf_zero(pmf_t op,ulong M)553 pmf_zero (pmf_t op, ulong M)
554 {
555 for (M++; M > 0; M--)
556 *op++ = 0;
557 }
558
559
560 /*
561 op := op / 2
562
563 Modulus must be odd.
564 */
565 #define pmf_divby2 \
566 ZNP_pmf_divby2
567 ZNP_INLINE void
pmf_divby2(pmf_t op,ulong M,const zn_mod_t mod)568 pmf_divby2 (pmf_t op, ulong M, const zn_mod_t mod)
569 {
570 ZNP_ASSERT (mod->m & 1);
571
572 for (op++; M > 0; M--, op++)
573 *op = zn_mod_divby2 (*op, mod);
574 }
575
576
577 /*
578 res := op
579 */
580 #define pmf_set \
581 ZNP_pmf_set
582 ZNP_INLINE void
pmf_set(pmf_t res,pmf_t op,ulong M)583 pmf_set (pmf_t res, pmf_t op, ulong M)
584 {
585 for (M++; M > 0; M--)
586 *res++ = *op++;
587 }
588
589
590 /*
591 op := Y^r * op
592 */
#define pmf_rotate \
    ZNP_pmf_rotate
ZNP_INLINE void
pmf_rotate (pmf_t op, ulong r)
{
   // Multiplying by Y^r just advances the bias word op[0]; the stored
   // coefficients are untouched. Unsigned overflow is harmless here,
   // since the bias is interpreted mod 2M.
   op[0] += r;
}
600
601
602 /*
603 op1 := op2 + op1
604 op2 := op2 - op1
605
606 Inputs must be [0, m); outputs will be in [0, m).
607 */
608 #define pmf_bfly \
609 ZNP_pmf_bfly
610 void
611 pmf_bfly (pmf_t op1, pmf_t op2, ulong M, const zn_mod_t mod);
612
613 /*
614 op1 := op1 + op2
615
616 Inputs must be [0, m); outputs will be in [0, m).
617 */
618 #define pmf_add \
619 ZNP_pmf_add
620 void
621 pmf_add (pmf_t op1, const pmf_t op2, ulong M, const zn_mod_t mod);
622
623 /*
624 op1 := op1 - op2
625
626 Inputs must be [0, m); outputs will be in [0, m).
627 */
628 #define pmf_sub \
629 ZNP_pmf_sub
630 void
631 pmf_sub (pmf_t op1, const pmf_t op2, ulong M, const zn_mod_t mod);
632
633
634
635
636 /*
637 These functions are exported just for profiling purposes:
638 */
639
640 #define zn_array_bfly_inplace \
641 ZNP_zn_array_bfly_inplace
642 void
643 zn_array_bfly_inplace (ulong* op1, ulong* op2, ulong n, const zn_mod_t mod);
644
645 #define zn_array_add_inplace \
646 ZNP_zn_array_add_inplace
647 void
648 zn_array_add_inplace (ulong* op1, const ulong* op2, ulong n,
649 const zn_mod_t mod);
650
651 #define zn_array_sub_inplace \
652 ZNP_zn_array_sub_inplace
653 void
654 zn_array_sub_inplace (ulong* op1, const ulong* op2, ulong n,
655 const zn_mod_t mod);
656
657
658
659 /* ============================================================================
660
661 pmfvec_t stuff
662
663 ============================================================================ */
664
665
666 /*
667 A pmfvec_t stores a vector of length K = 2^lgK of elements of S.
668
669 Used to represent an element of S[Z]/(Z^K + 1) or S[Z]/(Z^K - 1), or some
670 other quotient like that.
671
672 The functions pmfvec_init/clear should be used to allocate storage for
673 this type. Also sometimes fake ones get created temporarily to point at
674 sub-vectors of existing vectors.
675 */
#define pmfvec_struct \
    ZNP_pmfvec_struct
typedef struct
{
   // points to the first coefficient (a pmf_t, i.e. bias word plus M
   // coefficient words)
   pmf_t data;

   // number of coefficients
   ulong K;
   unsigned lgK;      // lg2(K)

   // length of coefficients (see definition of pmf_t)
   ulong M;
   unsigned lgM;      // lg2(M)

   // distance between adjacent coefficients, measured in ulongs
   // (this is at least M + 1, might be more)
   ptrdiff_t skip;

   // associated modulus
   const zn_mod_struct* mod;
}
pmfvec_struct;

// pmfvec_t follows the GMP convention of a one-element array type, so
// that it is passed by reference without explicit & at call sites.
#define pmfvec_t \
    ZNP_pmfvec_t
typedef pmfvec_struct pmfvec_t[1];
703
704
705 /*
706 Checks that vec1 and vec2 have compatible data,
707 i.e. have the same K, M, mod.
708 */
709 #define pmfvec_compatible \
710 ZNP_pmfvec_compatible
711 ZNP_INLINE int
pmfvec_compatible(const pmfvec_t vec1,const pmfvec_t vec2)712 pmfvec_compatible (const pmfvec_t vec1, const pmfvec_t vec2)
713 {
714 return (vec1->K == vec2->K) && (vec1->M == vec2->M) &&
715 (vec1->mod == vec2->mod);
716 }
717
718
719 /*
720 Initialises res with given parameters, allocates memory.
721 */
722 #define pmfvec_init \
723 ZNP_pmfvec_init
724 void
725 pmfvec_init (pmfvec_t res, unsigned lgK, ptrdiff_t skip, unsigned lgM,
726 const zn_mod_t mod);
727
728
729 /*
730 Initialises res in preparation for a Nussbaumer multiplication of
731 length 2^lgL.
732 */
733 #define pmfvec_init_nuss \
734 ZNP_pmfvec_init_nuss
735 void
736 pmfvec_init_nuss (pmfvec_t res, unsigned lgL, const zn_mod_t mod);
737
738
739 /*
740 Destroys op, frees all associated memory.
741 */
742 #define pmfvec_clear \
743 ZNP_pmfvec_clear
744 void
745 pmfvec_clear (pmfvec_t op);
746
747
748 /*
749 res := op
750 */
751 #define pmfvec_set \
752 ZNP_pmfvec_set
753 void
754 pmfvec_set (pmfvec_t res, const pmfvec_t op);
755
756
757 /*
758 Multiplies first n coefficients of op by x.
759 */
760 #define pmfvec_scalar_mul \
761 ZNP_pmfvec_scalar_mul
762 void
763 pmfvec_scalar_mul (pmfvec_t op, ulong n, ulong x);
764
765
766 /*
767 Multiplies pointwise the first n coefficients of op1 and op2, puts result
768 in res.
769
770 It's okay for res to alias op1 or op2. The modulus must be odd.
771
772 If the special_first_two flag is set, the routine assumes that the first
773 two coefficients are of length only M/2 (this is the typical situation
774 after performing the FFT), and multiplies them more quickly accordingly.
775
776 The routine automatically selects KS or Nussbaumer multiplication
777 depending on the modulus bitsize and on M.
778
779 The output will be divided by a fudge factor, which can be retrieved
780 via pmfvec_mul_fudge().
781
782 Automatically uses specialised squaring algorithm if the inputs are the
783 same pmfvec_t object.
784 */
785 #define pmfvec_mul \
786 ZNP_pmfvec_mul
787 void
788 pmfvec_mul (pmfvec_t res, const pmfvec_t op1, const pmfvec_t op2, ulong n,
789 int special_first_two);
790
791 #define pmfvec_mul_fudge \
792 ZNP_pmfvec_mul_fudge
793 ulong
794 pmfvec_mul_fudge (unsigned lgM, int sqr, const zn_mod_t mod);
795
796
797 /*
798 Modifies the op->data and op->skip to make it look as if the first
799 coefficient is the one at index n - 1, and the last coefficient is
800 the one at index 0. Calling this function again undoes the reversal.
801 Note that this function *must* be called a second time before calling
802 pmfvec_clear(), so that free() is not called on the wrong pointer!
803 */
804 #define pmfvec_reverse \
805 ZNP_pmfvec_reverse
806 void
807 pmfvec_reverse (pmfvec_t op, ulong n);
808
809
810
811 /* ============================================================================
812
813 stuff in pmfvec_fft.c
814
815 ============================================================================ */
816
817 /*
818 ---------- forward FFTs ----------
819
820 The functions
821
822 pmfvec_fft()
823 pmfvec_fft_basecase()
824 pmfvec_fft_dc()
825 pmfvec_fft_huge()
826
827 operate on a pmfvec_t, and compute inplace the FFT:
828
829 b_k = Y^{t k'} \sum_{i=0}^{K-1} Y^{2 M i k' / K} a_i,
830
831 where 0 <= t < 2M / K is a twist parameter. The notation k' indicates the
832 length lgK bit-reversal of k.
833
834 All except the "basecase" version have an n parameter; they only compute the
835 first n outputs. The remaining buffers are used in intermediate steps, and
836 contain junk at the end. For "basecase", all K outputs are computed.
837
838 All except the "basecase" version have a z parameter; they assume that the
839 input coefficients are zero from index z and beyond. They never read from
840 those coefficients. For "basecase", all the inputs are used.
841
842 The four versions use different algorithms as follows:
843
844 * pmfvec_fft(): main entry point, delegates to one of the other routines
845 based on the size of the transform.
846
847 * pmfvec_fft_basecase(): low-overhead iterative FFT, no truncation logic.
848
849 * pmfvec_fft_dc(): divide-and-conquer. It handles the top layer of
850 butterflies, and then recurses into the two halves. This is intended
851 for fairly small transforms where locality is not a big issue. The
852 algorithm implemented here is essentially van der Hoeven's "truncated
853 Fourier transform" [vdH04], [vdH05].
854
855 * pmfvec_fft_huge(): intended for large transforms, where locality
856 is an issue. It factors the FFT into U = 2^lgU transforms of length
857 T = 2^lgT followed by T transforms of length U, where K = T * U. This
858 is done recursively until we're in L1 cache (or as small as possible),
859 at which point we switch to pmfvec_fft_dc().
860
861 The algorithm is straightforward, but I believe it to be new. It is
862 simultaneously a generalisation of van der Hoeven's truncated Fourier
863 transform and Bailey's FFT algorithm [Bai89]. (I used something
864 similar in the ZmodF_poly module in FLINT.)
865
866
867 ---------- inverse FFTs ----------
868
869 The functions
870
871 pmfvec_ifft()
872 pmfvec_ifft_basecase()
873 pmfvec_ifft_dc()
874 pmfvec_ifft_huge()
875
876 compute the corresponding inverse FFT. They are a little more complicated
877 than the FFTs owing to the truncation.
878
879 Let a_i and b_k be as described above for the FFTs. The IFFT functions
880 take as input the array
881
882 b_{0'}, b_{1'}, ..., b_{(n-1)'}, K*a_n, K*a_{n+1}, ..., K*a_{K-1},
883
884 for some 0 <= n <= K. If the fwd flag is zero, then the output of the IFFT
885 routine is:
886
887 K*a_0, K*a_1, ..., K*a_{n-1}
888
889 followed by K - n junk coefficients. If the fwd flag is set, then we
890 require that 0 <= n <= K - 1, and the output is
891
892 K*a_0, K*a_1, ..., K*a_{n-1}, b_n
893
894 followed by K - n - 1 junk coefficients; i.e. it also computes one
895 coefficient of the *forward* FFT.
896
897 The "basecase" version has no n parameter, and assumes that n = K. Here
898 the routine becomes the (non-truncated) IFFT as usually understood, with
899 inputs in bit-reversed order and outputs in normal order.
900
901 All except the "basecase" version have a z parameter (with z >= n); they
902 assume that the input coefficients are zero from index z and beyond. They
903 never read from those coefficients. For the "basecase" version, all of the
904 inputs are used.
905
906 The four versions use different algorithms as follows:
907
908 * pmfvec_ifft(): main entry point, delegates to one of the other routines
909 based on the size of the transform.
910
911 * pmfvec_ifft_basecase(): low-overhead iterative IFFT, no truncation
912 logic.
913
914 * pmfvec_ifft_dc(): divide-and-conquer. It recurses into the two halves,
915 and handles the top layer of butterflies. This is intended for fairly
916 small transforms where locality is not a big issue. The algorithm
917 implemented here is essentially van der Hoeven's beautiful "truncated
918 inverse Fourier transform".
919
920 * pmfvec_ifft_huge(): intended for large transforms, where locality
921 is an issue. It factors the FFT into U = 2^lgU transforms of length
922 T = 2^lgT and T transforms of length U, where K = T * U. This is done
923 recursively until we're in L1 cache (if possible), at which point we
924 switch to pmfvec_ifft_dc().
925
926 The algorithm is not as simple as the FFT version; it is necessary
927 to alternate between "row" and "column" transforms in a slightly
928 complicated way. I believe the algorithm to be new.
929
930
931 ---------- transposed forward and inverse FFTs ----------
932
933 The functions
934
935 pmfvec_tpfft_basecase()
936 pmfvec_tpfft_dc()
937 pmfvec_tpfft_huge()
938 pmfvec_tpfft()
939
940 are *transposed* versions of the corresponding pmfvec_fft() routines. This
941 means that if the FFT computes an S-linear map from S^z to S^n, the
942 transposed version computes the transpose map from S^n to S^z.
943
944 Similarly, the functions
945
946 pmfvec_tpifft_basecase()
947 pmfvec_tpifft_dc()
948 pmfvec_tpifft_huge()
949 pmfvec_tpifft()
950
951 are transposed versions of the IFFT routines. If the IFFT computes an
952 S-linear map from S^z to S^(n + fwd), the transposed version computes the
953 transpose map from S^(n + fwd) to S^z.
954
955 The algorithms are transposed essentially by reversing them, and
956 transposing every step of the algorithm; see for example [BLS03] for how
957 this is done. We don't have comments in these routines; see the comments
958 on the corresponding FFT/IFFT routines.
959
960
961 ---------- notes for all the above functions ----------
962
963 These functions all perform O(n * lgK + K) operations in S (an "operation"
964 being an addition/subtraction/copy with possibly an implied rotation by a
965 power of Y). In particular the running time varies fairly smoothly with
966 n instead of jumping with K.
967
968 For all four algorithms, the "dc" version is essentially equivalent to the
969 "huge" version with lgT = 1.
970
971 These functions are not thread-safe. Apart from modifying the input inplace,
972 they also temporarily modify the pmfvec_t structs themselves.
973
974 Our approach to improving cache performance is certainly not ideal. The
975 main problem is that we can get address conflicts, especially since
976 everything gets spread out by powers of two. Mitigating factors:
977 associativity in the cache; the extra bias word scrambles the addresses
978 somewhat; when the transforms gets large, so do the coefficients, so we
979 don't expect to fit that many in cache anyway.
980
981 */
982
983
984 #define pmfvec_fft \
985 ZNP_pmfvec_fft
986 void
987 pmfvec_fft (pmfvec_t op, ulong n, ulong z, ulong t);
988
989 #define pmfvec_fft_huge \
990 ZNP_pmfvec_fft_huge
991 void
992 pmfvec_fft_huge (pmfvec_t op, unsigned lgT, ulong n, ulong z, ulong t);
993
994 #define pmfvec_fft_dc \
995 ZNP_pmfvec_fft_dc
996 void
997 pmfvec_fft_dc (pmfvec_t op, ulong n, ulong z, ulong t);
998
999 #define pmfvec_fft_basecase \
1000 ZNP_pmfvec_fft_basecase
1001 void
1002 pmfvec_fft_basecase (pmfvec_t op, ulong t);
1003
1004 #define pmfvec_ifft \
1005 ZNP_pmfvec_ifft
1006 void
1007 pmfvec_ifft (pmfvec_t op, ulong n, int fwd, ulong z, ulong t);
1008
1009 #define pmfvec_ifft_huge \
1010 ZNP_pmfvec_ifft_huge
1011 void
1012 pmfvec_ifft_huge (pmfvec_t op, unsigned lgT, ulong n, int fwd, ulong z,
1013 ulong t);
1014
1015 #define pmfvec_ifft_dc \
1016 ZNP_pmfvec_ifft_dc
1017 void
1018 pmfvec_ifft_dc (pmfvec_t op, ulong n, int fwd, ulong z, ulong t);
1019
1020 #define pmfvec_ifft_basecase \
1021 ZNP_pmfvec_ifft_basecase
1022 void
1023 pmfvec_ifft_basecase (pmfvec_t op, ulong t);
1024
1025 #define pmfvec_tpfft \
1026 ZNP_pmfvec_tpfft
1027 void
1028 pmfvec_tpfft (pmfvec_t op, ulong n, ulong z, ulong t);
1029
1030 #define pmfvec_tpfft_huge \
1031 ZNP_pmfvec_tpfft_huge
1032 void
1033 pmfvec_tpfft_huge (pmfvec_t op, unsigned lgT, ulong n, ulong z, ulong t);
1034
1035 #define pmfvec_tpfft_dc \
1036 ZNP_pmfvec_tpfft_dc
1037 void
1038 pmfvec_tpfft_dc (pmfvec_t op, ulong n, ulong z, ulong t);
1039
1040 #define pmfvec_tpfft_basecase \
1041 ZNP_pmfvec_tpfft_basecase
1042 void
1043 pmfvec_tpfft_basecase (pmfvec_t op, ulong t);
1044
1045 #define pmfvec_tpifft \
1046 ZNP_pmfvec_tpifft
1047 void
1048 pmfvec_tpifft (pmfvec_t op, ulong n, int fwd, ulong z, ulong t);
1049
1050 #define pmfvec_tpifft_huge \
1051 ZNP_pmfvec_tpifft_huge
1052 void
1053 pmfvec_tpifft_huge (pmfvec_t op, unsigned lgT, ulong n, int fwd, ulong z,
1054 ulong t);
1055
1056 #define pmfvec_tpifft_dc \
1057 ZNP_pmfvec_tpifft_dc
1058 void
1059 pmfvec_tpifft_dc (pmfvec_t op, ulong n, int fwd, ulong z, ulong t);
1060
1061 #define pmfvec_tpifft_basecase \
1062 ZNP_pmfvec_tpifft_basecase
1063 void
1064 pmfvec_tpifft_basecase (pmfvec_t op, ulong t);
1065
1066
1067
1068 /* ============================================================================
1069
1070 stuff in array.c
1071
1072 ============================================================================ */
1073
1074
1075 /*
1076 Computes
1077
1078 res = sign1*op1 + sign2*op2,
1079
1080 where sign1 = -1 if neg1 is set, otherwise +1; ditto for sign2.
1081
1082 op1 and op2 are arrays of length n.
1083
1084 res is a staggered array, entries separated by s.
1085
1086 Return value is res + s*n, i.e. points beyond the written array.
1087 */
1088 #define zn_skip_array_signed_add \
1089 ZNP_zn_skip_array_signed_add
1090 ulong*
1091 zn_skip_array_signed_add (ulong* res, ptrdiff_t skip, size_t n,
1092 const ulong* op1, int neg1,
1093 const ulong* op2, int neg2,
1094 const zn_mod_t mod);
1095
1096
1097 /*
1098 Same as zn_array_scalar_mul, but has a _redc_ flag. If the flag is set,
1099 then REDC reduction is used (in which case the modulus must be odd),
1100 otherwise ordinary reduction is used.
1101 */
1102 #define _zn_array_scalar_mul \
1103 ZNP__zn_array_scalar_mul
1104 void
1105 _zn_array_scalar_mul (ulong* res, const ulong* op, size_t n, ulong x,
1106 int redc, const zn_mod_t mod);
1107
1108
1109 /*
1110 Behaves just like zn_array_scalar_mul, except it uses the obvious
1111 optimisation if x == 1.
1112 */
1113 #define zn_array_scalar_mul_or_copy \
1114 ZNP_zn_array_scalar_mul_or_copy
1115 void
1116 zn_array_scalar_mul_or_copy (ulong* res, const ulong* op, size_t n,
1117 ulong x, const zn_mod_t mod);
1118
1119
1120 /* ============================================================================
1121
1122 stuff in nuss.c
1123
1124 ============================================================================ */
1125
1126
1127 /*
1128 Performs negacyclic multiplication using Nussbaumer's algorithm.
1129
1130 vec1 and vec2 must be pre-initialised pmfvec_t's with the same
1131 modulus and the same lgM and lgK, satisfying lgM + 1 >= lgK (i.e. there
1132 are enough roots of unity). These are used for scratch space.
1133
1134 The convolution length is L = 2^lgL, where lgL = lgM + lgK - 1.
1135
1136 Inputs are op1[0, L) and op2[0, L), output is res[0, L). It's okay for res
1137 to alias op1 or op2.
1138
1139 If op1 == op2, then a faster squaring version is used. In this case
1140 vec2 is ignored.
1141
1142 The result comes out divided by a fudge factor, which can be recovered
1143 via nuss_mul_fudge().
1144 */
1145 #define nuss_mul \
1146 ZNP_nuss_mul
1147 void
1148 nuss_mul (ulong* res, const ulong* op1, const ulong* op2,
1149 pmfvec_t vec1, pmfvec_t vec2);
1150
1151 #define nuss_mul_fudge \
1152 ZNP_nuss_mul_fudge
1153 ulong
1154 nuss_mul_fudge (unsigned lgL, int sqr, const zn_mod_t mod);
1155
1156
1157 /*
1158 Computes optimal lgK and lgM for given lgL, as described above for
1159 nuss_mul().
1160 */
1161 #define nuss_params \
1162 ZNP_nuss_params
1163 void
1164 nuss_params (unsigned* lgK, unsigned* lgM, unsigned lgL);
1165
1166
1167
1168 /* ============================================================================
1169
1170 stuff from mul_fft.c
1171
1172 ============================================================================ */
1173
1174
1175 /*
1176 Splits op[-k, n) into pieces of length M/2, where M = res->M, and where
1177 the first k coefficients are assumed to be zero. The pieces are written to
1178 the first ceil((n + k) / (M/2)) coefficients of res. The last fractional
1179 piece is treated as if zero-padded up to length M/2. The second half of
1180 each target pmf_t is zeroed out, and the bias fields are all set to b.
1181
1182 If x != 1, then all entries are multiplied by x mod m.
1183 */
1184 #define fft_split \
1185 ZNP_fft_split
1186 void
1187 fft_split (pmfvec_t res, const ulong* op, size_t n, size_t k, ulong x,
1188 ulong b);
1189
1190
1191 /*
1192 Performs the substitution back from S[Z]/(Z^K - 1) to a polynomial in X,
1193 i.e. mapping Y -> X, Z -> X^(M/2). It only looks at the first z coefficients
1194 of op; it assumes the rest are zero. It writes exactly n coefficients of
1195 output. If skip_first is set, it ignores the first M/2 coefficients of
1196 output, and then writes the *next* M/2 coefficients of output.
1197
1198 NOTE: this routine is not threadsafe: it temporarily modifies the bias
1199 field of the first coefficient of op.
1200 */
1201 #define fft_combine \
1202 ZNP_fft_combine
1203 void
1204 fft_combine (ulong* res, size_t n, const pmfvec_t op, ulong z,
1205 int skip_first);
1206
1207
1208
1209 /*
1210 Same as zn_array_mul(), but uses the Schonhage/Nussbaumer FFT algorithm.
1211
1212 Uses faster algorithm for squaring if inputs are identical buffers.
1213
1214 The modulus must be odd.
1215
1216 Output may overlap the inputs.
1217
1218 The output will come out divided by a fudge factor, which can be recovered
1219 via zn_array_mul_fft_fudge().
1220
1221 If x != 1, the output is further multiplied by x.
1222 */
1223 #define zn_array_mul_fft \
1224 ZNP_zn_array_mul_fft
1225 void
1226 zn_array_mul_fft (ulong* res,
1227 const ulong* op1, size_t n1,
1228 const ulong* op2, size_t n2,
1229 ulong x, const zn_mod_t mod);
1230
1231 #define zn_array_mul_fft_fudge \
1232 ZNP_zn_array_mul_fft_fudge
1233 ulong
1234 zn_array_mul_fft_fudge (size_t n1, size_t n2, int sqr, const zn_mod_t mod);
1235
1236
1237 /*
1238 Computes the best lgK, lgM, m1, m2 such that polynomials of length n1 and
1239 n2 may be multiplied with fourier transform parameters lgK and lgM, and
1240 where the polynomials get split into m1 (resp. m2) chunks of length M/2.
1241
1242 More precisely, the outputs satisfy:
1243
1244 * lgM + 1 >= lgK (i.e. there are enough roots of unity)
1245 * m1 + m2 - 1 <= K, where m1 = ceil(n1 / (M/2)) and m2 = ceil(n2 / (M/2))
1246 (i.e. the transform has enough room for the answer)
1247 * lgM >= 1
1248 * lgM is minimal subject to the above conditions.
1249 */
1250 #define mul_fft_params \
1251 ZNP_mul_fft_params
1252 void
1253 mul_fft_params (unsigned* lgK, unsigned* lgM, ulong* m1, ulong* m2,
1254 size_t n1, size_t n2);
1255
1256
1257 /*
1258 Same as zn_array_mulmid(), but uses the Schonhage/Nussbaumer FFT algorithm.
1259
1260 The modulus must be odd.
1261
1262 Output may overlap the inputs.
1263
1264 The output will come out divided by a fudge factor, which can be recovered
1265 via zn_array_mulmid_fft_fudge().
1266
1267 If x != 1, the output is further multiplied by x.
1268 */
1269 #define zn_array_mulmid_fft \
1270 ZNP_zn_array_mulmid_fft
1271 void
1272 zn_array_mulmid_fft (ulong* res,
1273 const ulong* op1, size_t n1,
1274 const ulong* op2, size_t n2,
1275 ulong x, const zn_mod_t mod);
1276
1277 #define zn_array_mulmid_fft_fudge \
1278 ZNP_zn_array_mulmid_fft_fudge
1279 ulong
1280 zn_array_mulmid_fft_fudge (size_t n1, size_t n2, const zn_mod_t mod);
1281
1282
1283 /*
1284 Computes the best lgK, lgM, m1, m2, p such that the middle product of
1285 polynomials of length n1 and n2 may be computed with fourier transform
1286 parameters lgK and lgM, where the first polynomial is padded on the left by
1287 p zeroes, and where the polynomials get split into m1 (resp. m2) chunks of
1288 length M/2.
1289
1290 More precisely, the outputs satisfy (see mul_fft.c for further discussion):
1291
1292 * lgM >= 1
1293 * lgM + 1 >= lgK (i.e. there are enough roots of unity)
1294 * 1 <= p <= M/2 and n2 + p - 1 is divisible by M/2
1295 * m1 = ceil((n1 + p) / (M/2))
1296 * m2 = ceil(n2 / (M/2))
1297 * m1 <= K
1298 * lgM is minimal subject to the above conditions.
1299 */
1300 #define mulmid_fft_params \
1301 ZNP_mulmid_fft_params
1302 void
1303 mulmid_fft_params (unsigned* lgK, unsigned* lgM, ulong* m1, ulong* m2,
1304 ulong* p, size_t n1, size_t n2);
1305
1306
1307 /*
1308 Stores precomputed information for performing an FFT-based middle product
1309 where the first input array is invariant, and the length of the second
1310 input array is invariant.
1311 */
1312 #define zn_array_mulmid_fft_precomp1_struct \
1313 ZNP_zn_array_mulmid_fft_precomp1_struct
1314 struct zn_array_mulmid_fft_precomp1_struct
1315 {
1316 // these parameters are as described at the top of mul_fft.c.
1317 size_t n1, n2;
1318 ulong m1, m2, p;
1319
1320 // holds the transposed IFFT of the input array
1321 pmfvec_t vec1;
1322 };
1323
1324 #define zn_array_mulmid_fft_precomp1_t \
1325 ZNP_zn_array_mulmid_fft_precomp1_t
1326 typedef struct zn_array_mulmid_fft_precomp1_struct
1327 zn_array_mulmid_fft_precomp1_t[1];
1328
1329
1330 /*
1331 Initialises res to perform middle product of op1[0, n1) by operands of
1332 size n2.
1333
1334 If x != 1, the data is multiplied by x. Since middle products are linear,
1335 this has the effect of multiplying the output of subsequent calls to
1336 zn_array_mulmid_fft_precomp1_execute() by x.
1337 */
1338 #define zn_array_mulmid_fft_precomp1_init \
1339 ZNP_zn_array_mulmid_fft_precomp1_init
1340 void
1341 zn_array_mulmid_fft_precomp1_init (zn_array_mulmid_fft_precomp1_t res,
1342 const ulong* op1, size_t n1, size_t n2,
1343 ulong x, const zn_mod_t mod);
1344
1345 /*
1346 Performs middle product of op1[0, n1) by op2[0, n2), stores result at
1347 res[0, n1 - n2 + 1).
1348
1349 The output will come out divided by a fudge factor, which can be recovered
1350 via zn_array_mulmid_fft_precomp1_fudge().
1351
1352 If x != 1, the output is further multiplied by x.
1353 */
1354 #define zn_array_mulmid_fft_precomp1_execute \
1355 ZNP_zn_array_mulmid_fft_precomp1_execute
1356 void
1357 zn_array_mulmid_fft_precomp1_execute
1358 (ulong* res, const ulong* op2, ulong x,
1359 const zn_array_mulmid_fft_precomp1_t precomp);
1360
1361 #define zn_array_mulmid_fft_precomp1_fudge \
1362 ZNP_zn_array_mulmid_fft_precomp1_fudge
1363 ulong
1364 zn_array_mulmid_fft_precomp1_fudge (size_t n1, size_t n2, const zn_mod_t mod);
1365
1366
1367 /*
1368 Deallocates op.
1369 */
1370 #define zn_array_mulmid_fft_precomp1_clear \
1371 ZNP_zn_array_mulmid_fft_precomp1_clear
1372 void
1373 zn_array_mulmid_fft_precomp1_clear (zn_array_mulmid_fft_precomp1_t op);
1374
1375
1376
1377 /* ============================================================================
1378
1379 stuff from mpn_mulmid.c
1380
1381 ============================================================================ */
1382
1383
1384 /*
1385 Let n1 >= n2 >= 1, and let
1386
1387 a = \sum_{i=0}^{n1-1} a_i B^i
1388 b = \sum_{j=0}^{n2-1} b_j B^j
1389
1390 be integers with n1 and n2 limbs respectively. We define SMP(a, b), the
1391 *simple* middle product of a and b, to be the integer
1392
1393 \sum_{0 <= i < n1, 0 <= j < n2, n2-1 <= i+j < n1} a_i b_j B^(i+j-(n2-1)).
1394
1395 In other words, it's as if we treat a and b as polynomials in Z[B] of
1396 length n1 and n2 respectively, compute the polynomial middle product over
1397 Z, and then propagate the high words and subsequent carries.
1398
1399 Note that SMP(a, b) is at most n1 - n2 + 3 limbs long (we assume throughout
1400 that n1 is less than the maximum value stored in a limb).
1401 */
1402
1403
1404 /*
1405 Computes SMP(op1[0, n1), op2[0, n2)).
1406
1407 Stores result at res[0, n1 - n2 + 3).
1408 */
1409 void
1410 ZNP_mpn_smp (mp_limb_t* res,
1411 const mp_limb_t* op1, size_t n1,
1412 const mp_limb_t* op2, size_t n2);
1413
1414
1415 /*
1416 Same as mpn_smp(), but always uses basecase (quadratic-time) algorithm.
1417
1418 res[0, n1 - n2 + 3) must not overlap op1[0, n1) or op2[0, n2).
1419 */
1420 void
1421 ZNP_mpn_smp_basecase (mp_limb_t* res,
1422 const mp_limb_t* op1, size_t n1,
1423 const mp_limb_t* op2, size_t n2);
1424
1425
1426 /*
1427 Computes SMP(op1[0, 2*n - 1), op2[0, n)). Algorithm is selected depending
1428 on size of n.
1429
1430 Stores result at res[0, n + 2). Output must not overlap inputs.
1431 */
1432 void
1433 ZNP_mpn_smp_n (mp_limb_t* res, const mp_limb_t* op1,
1434 const mp_limb_t* op2, size_t n);
1435
1436
1437 /*
1438 Computes SMP(op1[0, 2*n - 1), op2[0, n)), using Karatsuba algorithm.
1439
1440 Must have n >= 2.
1441
1442 Stores result at res[0, n + 2). Output must not overlap inputs.
1443 */
1444 void
1445 ZNP_mpn_smp_kara (mp_limb_t* res, const mp_limb_t* op1, const mp_limb_t* op2,
1446 size_t n);
1447
1448
1449
1450 /*
1451 Computes the *true* middle product of op1[0, n1) and op2[0, n2).
1452
1453 More precisely, let P be the product op1 * op2. This function computes
1454 P[n2 + 1, n1), and stores this at res[2, n1 - n2 + 1).
1455
1456 Must have n1 >= n2 >= 1.
1457
1458 The output buffer res *must* have room for n1 - n2 + 3 limbs, but the first
1459 two limbs and the last two limbs of the output will be *garbage*. In
1460 particular, only n1 - n2 - 1 limbs of useful output are produced. If
1461 n1 <= n2 + 1, then no useful output is produced.
1462 */
1463 void
1464 ZNP_mpn_mulmid (mp_limb_t* res,
1465 const mp_limb_t* op1, size_t n1,
1466 const mp_limb_t* op2, size_t n2);
1467
1468
1469 /*
1470 Same as mpn_mulmid, but always just falls back on using mpn_mul.
1471 */
1472 void
1473 ZNP_mpn_mulmid_fallback (mp_limb_t* res,
1474 const mp_limb_t* op1, size_t n1,
1475 const mp_limb_t* op2, size_t n2);
1476
1477
1478
1479
1480 /* ============================================================================
1481
1482 stuff from mulmid.c
1483
1484 ============================================================================ */
1485
1486
1487 /*
1488 Same as zn_array_mulmid(), but always falls back on doing the full product
1489 via _zn_array_mul().
1490
1491 If fastred is cleared, the output is the same as for zn_array_mulmid().
1492
1493 If fastred is set, the routine uses the fastest modular reduction strategy
1494 available for the given parameters. The result will come out divided by a
1495 fudge factor, which can be recovered via
1496 _zn_array_mulmid_fallback_fudge().
1497 */
1498 #define zn_array_mulmid_fallback \
1499 ZNP_zn_array_mulmid_fallback
1500 void
1501 zn_array_mulmid_fallback (ulong* res,
1502 const ulong* op1, size_t n1,
1503 const ulong* op2, size_t n2,
1504 int fastred, const zn_mod_t mod);
1505
1506 #define zn_array_mulmid_fallback_fudge \
1507 ZNP_zn_array_mulmid_fallback_fudge
1508 ulong
1509 zn_array_mulmid_fallback_fudge (size_t n1, size_t n2, const zn_mod_t mod);
1510
1511
1512 /*
1513 Identical to zn_array_mulmid(), except for the fastred flag.
1514
1515 If fastred is cleared, the output is the same as for zn_array_mulmid().
1516
1517 If fastred is set, the routine uses the fastest modular reduction strategy
1518 available for the given parameters. The result will come out divided by a
1519 fudge factor, which can be recovered via _zn_array_mulmid_fudge().
1520 */
1521 #define _zn_array_mulmid \
1522 ZNP__zn_array_mulmid
1523 void
1524 _zn_array_mulmid (ulong* res,
1525 const ulong* op1, size_t n1,
1526 const ulong* op2, size_t n2,
1527 int fastred, const zn_mod_t mod);
1528
1529 #define _zn_array_mulmid_fudge \
1530 ZNP__zn_array_mulmid_fudge
1531 ulong
1532 _zn_array_mulmid_fudge (size_t n1, size_t n2, const zn_mod_t mod);
1533
1534
1535 /* ============================================================================
1536
1537 other random stuff
1538
1539 ============================================================================ */
1540
1541
1542 /*
1543 Compute 2^k mod m.
1544
1545 Modulus must be odd.
1546
1547 Must have -ULONG_BITS < k < ULONG_BITS.
1548 */
1549 #define zn_mod_pow2 \
1550 ZNP_zn_mod_pow2
1551 ulong
1552 zn_mod_pow2 (int k, const zn_mod_t mod);
1553
1554
1555
1556 /*
1557 Constants describing algorithms for precomputed middle products
1558 */
1559 #define ZNP_MULMID_ALGO_FALLBACK 0
1560 #define ZNP_MULMID_ALGO_KS 1
1561 #define ZNP_MULMID_ALGO_FFT 2
1562
1563
1564
1565 #ifdef __cplusplus
1566 }
1567 #endif
1568
1569 #endif
1570
1571 // end of file ****************************************************************
1572