1 /*
2 pmfvec_fft.c: FFT/IFFT and transposed FFT/IFFT routines for pmfvec_t
3
4 Copyright (C) 2007, 2008, David Harvey
5
6 This file is part of the zn_poly library (version 0.9).
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 2 of the License, or
11 (at your option) version 3 of the License.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20
21 */
22
23 #include <stdio.h>
24 #include "zn_poly_internal.h"
25
26
27 /* ============================================================================
28
29 FFT routines
30
31 ============================================================================ */
32
33
34 void
pmfvec_fft_basecase(pmfvec_t op,ulong t)35 pmfvec_fft_basecase (pmfvec_t op, ulong t)
36 {
37 ZNP_ASSERT (op->lgK <= op->lgM + 1);
38 ZNP_ASSERT (t * op->K < 2 * op->M);
39
40 if (op->lgK == 0)
41 return;
42
43 // just plain butterfly loop
44
45 ulong M = op->M;
46 const zn_mod_struct* mod = op->mod;
47
48 ulong s, r = M >> (op->lgK - 1);
49 ptrdiff_t half = op->skip << (op->lgK - 1);
50 ulong* end = op->data + (op->skip << op->lgK);
51 ulong* p;
52 ulong* start;
53
54 for (; r <= M; r <<= 1, half >>= 1, t <<= 1)
55 for (start = op->data, s = t; s < M; s += r, start += op->skip)
56 for (p = start; p < end; p += 2 * half)
57 {
58 pmf_bfly (p, p + half, M, mod);
59 pmf_rotate (p + half, M + s);
60 }
61 }
62
63
64
65 void
pmfvec_fft_dc(pmfvec_t op,ulong n,ulong z,ulong t)66 pmfvec_fft_dc (pmfvec_t op, ulong n, ulong z, ulong t)
67 {
68 ZNP_ASSERT (op->lgK <= op->lgM + 1);
69 ZNP_ASSERT (t * op->K < 2 * op->M);
70 ZNP_ASSERT (n >= 1 && n <= op->K);
71 ZNP_ASSERT (z >= 1 && z <= op->K);
72
73 if (op->K == 1)
74 return;
75
76 if (n == op->K && z == op->K)
77 {
78 // No truncation requested; use iterative version
79 pmfvec_fft_basecase (op, t);
80 return;
81 }
82
83 const zn_mod_struct* mod = op->mod;
84
85 // We treat the input as two rows and U = K/2 columns, in row-major order.
86
87 // descend to first row (first half of op)
88 op->lgK--;
89 op->K >>= 1;
90
91 long i;
92 ulong M = op->M;
93 ulong U = op->K;
94 ulong* p = op->data;
95 ptrdiff_t skip = op->skip;
96 ptrdiff_t half = skip << op->lgK;
97 ulong z2 = ZNP_MIN (z, U);
98
99 if (n <= U)
100 {
101 // Only need the first output of the first layer of butterflies.
102 for (i = 0; i < (long)(z - U); i++, p += skip)
103 pmf_add (p, p + half, M, mod);
104
105 // Recurse into top row
106 pmfvec_fft_dc (op, n, z2, t << 1);
107 }
108 else
109 {
110 // Need both outputs from the first layer of butterflies.
111 ulong s = t;
112 ulong r = M >> op->lgK;
113
114 for (i = 0; i < (long)(z - U); i++, p += skip, s += r)
115 {
116 pmf_bfly (p, p + half, M, mod);
117 pmf_rotate (p + half, M + s);
118 }
119
120 // Butterflies where second input is zero
121 for (; i < z2; i++, p += skip, s += r)
122 {
123 pmf_set (p + half, p, M);
124 pmf_rotate (p + half, s);
125 }
126
127 // Recurse into top row...
128 pmfvec_fft_dc (op, U, z2, t << 1);
129
130 // ... and recurse into bottom row
131 op->data += half;
132 pmfvec_fft_dc (op, n - U, z2, t << 1);
133 op->data -= half;
134 }
135
136 // pop back to whole transform
137 op->K <<= 1;
138 op->lgK++;
139 }
140
141
142
143 /*
144 As described above, this splits the length K transform into T = 2^lgT rows
145 by U = 2^lgU columns, where K = U * T.
146
147 Must have 0 < lgT < lgK.
148 */
149 void
pmfvec_fft_huge(pmfvec_t op,unsigned lgT,ulong n,ulong z,ulong t)150 pmfvec_fft_huge (pmfvec_t op, unsigned lgT, ulong n, ulong z, ulong t)
151 {
152 ZNP_ASSERT (op->lgK <= op->lgM + 1);
153 ZNP_ASSERT (t * op->K < 2 * op->M);
154 ZNP_ASSERT (lgT > 0 && lgT < op->lgK);
155 ZNP_ASSERT (n >= 1 && n <= op->K);
156 ZNP_ASSERT (z >= 1 && z <= op->K);
157
158 unsigned lgK = op->lgK;
159 unsigned lgU = lgK - lgT;
160
161 ulong K = op->K;
162 ulong T = 1UL << lgT;
163 ulong U = 1UL << lgU;
164
165 ptrdiff_t skip = op->skip;
166 ptrdiff_t skip_U = skip << lgU;
167
168 ulong* data = op->data;
169
170 // We need n output coefficients, starting from the top-left, in row-major
171 // order.
172
173 // Write n = U * nT + nU, where 0 <= nU < U
174 ulong nU = n & (U - 1);
175 ulong nT = n >> lgU;
176
177 // nT_ceil = number of rows of output, including the last partial row
178 ulong nT_ceil = nT + (nU > 0);
179
180 // Write z = U * zT + zU, where 0 <= zU < U
181 ulong zT = z >> lgU;
182 ulong zU = z & (U - 1);
183 ulong zU2 = zT ? U : zU;
184
185 ulong r = op->M >> (lgK - 1);
186 ulong s, i;
187
188 // --------------- FFTs along columns
189
190 op->K = T;
191 op->lgK = lgT;
192 op->skip = skip_U;
193
194 // First handle the columns with zT + 1 input coefficients.
195 for (i = 0, s = t; i < zU; i++, op->data += skip, s += r)
196 pmfvec_fft (op, nT_ceil, zT + 1, s);
197
198 // Handle the remaining columns, which only have zT input coefficients.
199 for (; i < zU2; i++, op->data += skip, s += r)
200 pmfvec_fft (op, nT_ceil, zT, s);
201
202 // --------------- FFTs along rows
203
204 op->data = data;
205 op->K = U;
206 op->lgK = lgU;
207 op->skip = skip;
208 t <<= lgT;
209
210 // Handle the first nT rows.
211 for (i = 0; i < nT; i++, op->data += skip_U)
212 pmfvec_fft (op, U, zU2, t);
213
214 // For the last row, we only need the first nU outputs:
215 if (nU)
216 pmfvec_fft (op, nU, zU2, t);
217
218 // --------------- restore parameters
219
220 op->data = data;
221 op->K = K;
222 op->lgK = lgK;
223 }
224
225
226
227 void
pmfvec_fft(pmfvec_t op,ulong n,ulong z,ulong t)228 pmfvec_fft (pmfvec_t op, ulong n, ulong z, ulong t)
229 {
230 ZNP_ASSERT (op->lgK <= op->lgM + 1);
231 ZNP_ASSERT (t * op->K < 2 * op->M);
232 ZNP_ASSERT (n >= 1 && n <= op->K);
233 ZNP_ASSERT (z >= 1 && z <= op->K);
234
235 if (op->K <= 2 || 2 * op->K * op->M * sizeof (ulong) <= ZNP_CACHE_SIZE)
236 {
237 // FFT is pretty small; use divide-and-conquer
238 pmfvec_fft_dc (op, n, z, t);
239 }
240 else
241 {
242 // FFT is relatively big; use factoring algorithm instead
243 pmfvec_fft_huge (op, op->lgK / 2, n, z, t);
244 }
245 }
246
247
248
249 /* ============================================================================
250
251 inverse FFT routines
252
253 ============================================================================ */
254
255 void
pmfvec_ifft_basecase(pmfvec_t op,ulong t)256 pmfvec_ifft_basecase (pmfvec_t op, ulong t)
257 {
258 ZNP_ASSERT (op->lgK <= op->lgM + 1);
259 ZNP_ASSERT (t * op->K < 2 * op->M);
260
261 if (op->lgK == 0)
262 return;
263
264 // just plain butterfly loop
265
266 ulong M = op->M;
267 const zn_mod_struct* mod = op->mod;
268
269 ulong s, r = M;
270 ulong r_last = M >> (op->lgK - 1);
271 t <<= (op->lgK - 1);
272 ptrdiff_t half = op->skip;
273 ulong* end = op->data + (op->skip << op->lgK);
274 ulong* p;
275 ulong* start;
276
277 for (; r >= r_last; r >>= 1, half <<= 1, t >>= 1)
278 for (start = op->data, s = t; s < M; s += r, start += op->skip)
279 for (p = start; p < end; p += 2 * half)
280 {
281 pmf_rotate (p + half, M - s);
282 pmf_bfly (p + half, p, M, mod);
283 }
284 }
285
286
287
288 void
pmfvec_ifft_dc(pmfvec_t op,ulong n,int fwd,ulong z,ulong t)289 pmfvec_ifft_dc (pmfvec_t op, ulong n, int fwd, ulong z, ulong t)
290 {
291 ZNP_ASSERT (op->lgK <= op->lgM + 1);
292 ZNP_ASSERT (t * op->K < 2 * op->M);
293 ZNP_ASSERT (z >= 1 && z <= op->K);
294 ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
295 ZNP_ASSERT (n <= z);
296
297 if (op->K == 1)
298 return;
299
300 if (n == op->K)
301 {
302 // No truncation requested; use iterative version
303 pmfvec_ifft_basecase (op, t);
304 return;
305 }
306
307 const zn_mod_struct* mod = op->mod;
308
309 // We treat the input as two rows and U = K / 2 columns, in row-major order.
310
311 // descend to first row (first half of op)
312 op->K >>= 1;
313 op->lgK--;
314
315 ulong M = op->M;
316 ulong U = op->K;
317 ptrdiff_t skip = op->skip;
318 ptrdiff_t half = skip << op->lgK;
319
320 // symbols in the following diagrams:
321 // A = fully untransformed coefficient (one of the a_i)
322 // B = intermediate coefficient
323 // C = fully transformed coefficient (one of the b_k)
324 // a, b, c = same as three above, but implied zero
325 // ? = garbage that we don't care about
326 // * = the "forward" C coefficient, or "?" if no forward coefficient
327 // requested
328
329 // The horizontal transforms convert between B and C.
330 // The vertical butterflies convert between A and B.
331
332 if (n + fwd <= U)
333 {
334 // The input could look like one of the following:
335 // CCCCAAAA CCCCAAAA CCCCAAaa CCCCaaaa
336 // AAAAAAaa or AAaaaaaa or aaaaaaaa or aaaaaaaa
337
338 long zU2 = ZNP_MIN (z, U);
339 long last_zero_fwd_bfly = ZNP_MAX (z - zU2, n);
340 long last_zero_cross_bfly = ZNP_MIN (z - zU2, n);
341
342 long i = zU2 - 1;
343 pmf_t p = op->data + skip * i;
344
345 // First some forward butterflies ("Aa" => "B?") to make them look like:
346 // CCCCAABB CCCCBBBB CCCCBBaa CCCCaaaa
347 // AAAAAA?? or AAaa???? or aaaa??aa or aaaaaaaa
348 for (; i >= last_zero_fwd_bfly; i--, p -= skip)
349 {
350 // (2*a0, ?) -> (a0, ?) = (b0, ?)
351 pmf_divby2 (p, M, mod);
352 }
353
354 // Then some forward butterflies ("AA" => "B?") to make them look like:
355 // CCCCBBBB CCCCBBBB CCCCBBaa CCCCaaaa
356 // AAAA???? or AAaa???? or aaaa??aa or aaaaaaaa
357 for (; i >= (long) n; i--, p -= skip)
358 {
359 // (2*a0, 2*a1) -> (a0 + a1, ?) = (b0, ?)
360 pmf_add (p, p + half, M, mod);
361 pmf_divby2 (p, M, mod);
362 }
363
364 // Transform the first row to make them look like:
365 // BBBB*??? BBBB*??? BBBB*??? BBBB*???
366 // AAAA???? or AAaa???? or aaaa??aa or aaaaaaaa
367 pmfvec_ifft_dc (op, n, fwd, zU2, t << 1);
368
369 // Cross butterflies ("Ba" => "A?") to make them look like:
370 // BBBB*??? BBAA*??? AAAA*??? AAAA*???
371 // AAAA???? or AA?????? or ??????aa or ????aaaa
372 for (; i >= last_zero_cross_bfly; i--, p -= skip)
373 {
374 // (b0, ?) -> (2*b0, ?) = (2*a0, ?)
375 pmf_add (p, p, M, mod);
376 }
377
378 // Cross butterflies ("BA" => "A?") to make them look like:
379 // AAAA*??? AAAA*??? AAAA*??? AAAA*???
380 // ???????? or ???????? or ??????aa or ????aaaa
381 for (; i >= 0; i--, p -= skip)
382 {
383 // (b0, 2*a1) -> (2*b0 - 2*a1, ?) = (2*a0, ?)
384 pmf_add (p, p, M, mod);
385 pmf_sub (p, p + half, M, mod);
386 }
387 }
388 else
389 {
390 // The input looks like one of these:
391 // CCCCCCCC CCCCCCCC
392 // AAAAaaaa (fwd == 1) or CCCAAAaa
393
394 // Transform first row (no truncation necessary) to make them look like:
395 // BBBBBBBB BBBBBBBB
396 // AAAAaaaa (fwd == 1) or CCCAAAaa
397 pmfvec_ifft_basecase (op, t << 1);
398
399 long i = U - 1;
400 ulong r = M >> op->lgK;
401 ulong s = t + r * i;
402 pmf_t p = op->data + skip * i;
403
404 long last_zero_cross_bfly = z - U;
405 long last_cross_bfly = n - U;
406
407 // Cross butterflies ("Ba" => "AB") to make them look like:
408 // BBBBAAAA BBBBBBAA
409 // AAAABBBB (fwd == 1) or CCCAAABB
410 for (; i >= last_zero_cross_bfly; i--, s -= r, p -= skip)
411 {
412 // (b0, ?) -> (2*b0, w*b0) = (2*a0, b1)
413 pmf_set (p + half, p, M);
414 pmf_rotate (p + half, s);
415 pmf_add (p, p, M, mod);
416 }
417
418 // Cross butterflies ("BA" => "AB") to make them look like:
419 // AAAAAAAA BBBAAAAA
420 // BBBBBBBB (fwd == 1) or CCCBBBBB
421 for (; i >= last_cross_bfly; i--, s -= r, p -= skip)
422 {
423 // (b0, 2*a1) -> (2*(b0-a1), w*(b0-2*a1)) = (2*a0, b1)
424 pmf_sub (p + half, p, M, mod);
425 pmf_sub (p, p + half, M, mod);
426 pmf_rotate (p + half, M + s);
427 }
428
429 // Transform second row to make them look like:
430 // AAAAAAAA BBBAAAAA
431 // *??????? (fwd == 1) or BBB*????
432 op->data += half;
433 pmfvec_ifft_dc (op, n - U, fwd, U, t << 1);
434 op->data -= half;
435
436 // Inverse butterflies ("BB" => "AA") to make them look like:
437 // AAAAAAAA AAAAAAAA
438 // *??????? (fwd == 1) or AAA*????
439 for (; i >= 0; i--, s -= r, p -= skip)
440 {
441 // (b0, b1) -> (b0 + w*b1, b0 - w*b1) = (2*a0, 2*a1)
442 pmf_rotate (p + half, M - s);
443 pmf_bfly (p + half, p, M, mod);
444 }
445 }
446
447 // pop back to full size
448 op->K <<= 1;
449 op->lgK++;
450 }
451
452
453
454 void
pmfvec_ifft_huge(pmfvec_t op,unsigned lgT,ulong n,int fwd,ulong z,ulong t)455 pmfvec_ifft_huge (pmfvec_t op, unsigned lgT, ulong n, int fwd, ulong z,
456 ulong t)
457 {
458 ZNP_ASSERT (op->lgK <= op->lgM + 1);
459 ZNP_ASSERT (t * op->K < 2 * op->M);
460 ZNP_ASSERT (z >= 1 && z <= op->K);
461 ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
462 ZNP_ASSERT (n <= z);
463 ZNP_ASSERT (lgT > 0 && lgT < op->lgK);
464
465 unsigned lgK = op->lgK;
466 unsigned lgU = lgK - lgT;
467
468 ulong K = op->K;
469 ulong T = 1UL << lgT;
470 ulong U = 1UL << lgU;
471
472 ptrdiff_t skip = op->skip;
473 ptrdiff_t skip_U = skip << lgU;
474
475 ulong* data = op->data;
476
477 // Write n = U * nT + nU, where 0 <= nU < U
478 ulong nU = n & (U - 1);
479 ulong nT = n >> lgU;
480
481 // Write z = U * zT + zU, where 0 <= zU < U
482 ulong zU = z & (U - 1);
483 ulong zT = z >> lgU;
484 ulong zU2 = zT ? U : zU;
485
486 ulong mU1 = ZNP_MIN (zU, nU);
487 ulong mU2 = ZNP_MAX (zU, nU);
488
489 int fwd2 = nU || fwd;
490
491 ulong r = op->M >> (lgK - 1);
492 ulong s, i;
493 ulong tT = t << lgT;
494
495 // where:
496 // symbols in the following diagrams:
497 // A = fully untransformed coefficient (one of the a_i)
498 // B = intermediate coefficient
499 // C = fully transformed coefficient (one of the b_k)
500 // ? = garbage that we don't care about
501 // * = the "forward" C coefficient, or "?" if no forward coefficient
502 // requested
503
504 // The input looks something like this:
505 //
506 // CCCCCCCC
507 // CCCCCCCC
508 // CCCAAAAA
509 // AAAAAAAA
510 //
511 // (we won't bother marking in the locations of zeroes on the diagrams)
512 //
513 // The horizontal transforms convert between B and C.
514 // The vertical transforms convert between A and B.
515
516 // First do row transforms for complete rows, to make it look like:
517 // BBBBBBBB
518 // BBBBBBBB
519 // CCCAAAAA
520 // AAAAAAAA
521 op->lgK = lgU;
522 op->K = U;
523 for (i = 0; i < nT; i++, op->data += skip_U)
524 pmfvec_ifft (op, U, 0, U, tT);
525
526 // Column transforms for the rightmost columns, to obtain
527 // BBBAAAAA
528 // BBBAAAAA
529 // CCCBBBBB
530 // AAA?????
531 op->lgK = lgT;
532 op->K = T;
533 op->skip = skip_U;
534
535 for (i = nU, op->data = data + (skip * nU), s = t + (r * nU); i < mU2;
536 i++, op->data += skip, s += r)
537 {
538 pmfvec_ifft (op, nT, fwd2, zT + 1, s);
539 }
540 for (; i < zU2; i++, op->data += skip, s += r)
541 pmfvec_ifft (op, nT, fwd2, zT, s);
542
543 // If there is still a partial row to deal with....
544 if (fwd2)
545 {
546 // Transform the partial row to obtain
547 // BBBAAAAA
548 // BBBAAAAA
549 // BBB*????
550 // AAA?????
551 op->data = data + nT * skip_U;
552 op->lgK = lgU;
553 op->K = U;
554 op->skip = skip;
555 pmfvec_ifft (op, nU, fwd, zU2, tT);
556
557 // Column transforms for the leftmost columns, to obtain
558 // AAAAAAAA
559 // AAAAAAAA
560 // AAA*????
561 // ????????
562 op->lgK = lgT;
563 op->K = T;
564 op->skip = skip_U;
565
566 for (i = 0, op->data = data, s = t; i < mU1;
567 i++, op->data += skip, s += r)
568 {
569 pmfvec_ifft (op, nT + 1, 0, zT + 1, s);
570 }
571 for (; i < nU; i++, op->data += skip, s += r)
572 pmfvec_ifft (op, nT + 1, 0, zT, s);
573 }
574
575 // restore parameters
576 op->lgK = lgK;
577 op->K = K;
578 op->skip = skip;
579 op->data = data;
580 }
581
582
583
584 void
pmfvec_ifft(pmfvec_t op,ulong n,int fwd,ulong z,ulong t)585 pmfvec_ifft (pmfvec_t op, ulong n, int fwd, ulong z, ulong t)
586 {
587 ZNP_ASSERT (op->lgK <= op->lgM + 1);
588 ZNP_ASSERT (t * op->K < 2 * op->M);
589 ZNP_ASSERT (z <= op->K);
590 ZNP_ASSERT (n <= z);
591 ZNP_ASSERT (n + fwd <= op->K);
592
593 if (op->K <= 2 || 2 * op->K * op->M * sizeof(ulong) <= ZNP_CACHE_SIZE)
594 {
595 // IFFT is pretty small; use use divide-and-conquer
596 pmfvec_ifft_dc (op, n, fwd, z, t);
597 }
598 else
599 {
600 // IFFT is relatively big; use factoring algorithm instead
601 pmfvec_ifft_huge (op, op->lgK / 2, n, fwd, z, t);
602 }
603 }
604
605
606
607 /* ============================================================================
608
609 transposed FFT routines
610
611 ============================================================================ */
612
613
614 void
pmfvec_tpfft_basecase(pmfvec_t op,ulong t)615 pmfvec_tpfft_basecase (pmfvec_t op, ulong t)
616 {
617 ZNP_ASSERT (op->lgK <= op->lgM + 1);
618 ZNP_ASSERT (t * op->K < 2 * op->M);
619
620 if (op->lgK == 0)
621 return;
622
623 ulong M = op->M;
624 const zn_mod_struct* mod = op->mod;
625
626 ulong s, r = M;
627 ulong r_last = M >> (op->lgK - 1);
628 t <<= (op->lgK - 1);
629 ptrdiff_t half = op->skip;
630 ulong* end = op->data + (op->skip << op->lgK);
631 ulong* p;
632 ulong* start;
633
634 for (; r >= r_last; r >>= 1, half <<= 1, t >>= 1)
635 for (start = op->data, s = t; s < M; s += r, start += op->skip)
636 for (p = start; p < end; p += 2 * half)
637 {
638 pmf_rotate (p + half, M + s);
639 pmf_bfly (p + half, p, M, mod);
640 }
641 }
642
643
644
645 void
pmfvec_tpfft_dc(pmfvec_t op,ulong n,ulong z,ulong t)646 pmfvec_tpfft_dc (pmfvec_t op, ulong n, ulong z, ulong t)
647 {
648 ZNP_ASSERT (op->lgK <= op->lgM + 1);
649 ZNP_ASSERT (t * op->K < 2 * op->M);
650 ZNP_ASSERT (n >= 1 && n <= op->K);
651 ZNP_ASSERT (z >= 1 && z <= op->K);
652
653 if (op->K == 1)
654 return;
655
656 if (n == op->K && z == op->K)
657 {
658 pmfvec_tpfft_basecase (op, t);
659 return;
660 }
661
662 const zn_mod_struct* mod = op->mod;
663
664 op->lgK--;
665 op->K >>= 1;
666
667 long i;
668 ulong M = op->M;
669 ulong U = op->K;
670 ulong* p = op->data;
671 ptrdiff_t skip = op->skip;
672 ptrdiff_t half = skip << op->lgK;
673 ulong z2 = ZNP_MIN (z, U);
674
675 if (n <= U)
676 {
677 pmfvec_tpfft_dc (op, n, z2, t << 1);
678
679 for (i = 0; i < (long)(z - U); i++, p += skip)
680 pmf_set (p + half, p, M);
681 }
682 else
683 {
684 op->data += half;
685 pmfvec_tpfft_dc (op, n - U, z2, t << 1);
686 op->data -= half;
687 pmfvec_tpfft_dc (op, U, z2, t << 1);
688
689 ulong s = t;
690 ulong r = M >> op->lgK;
691
692 for (i = 0; i < (long)(z - U); i++, p += skip, s += r)
693 {
694 pmf_rotate (p + half, M + s);
695 pmf_bfly (p + half, p, M, mod);
696 }
697
698 for (; i < z2; i++, p += skip, s += r)
699 {
700 pmf_rotate (p + half, s);
701 pmf_add (p, p + half, M, mod);
702 }
703 }
704
705 op->K <<= 1;
706 op->lgK++;
707 }
708
709
710
711 void
pmfvec_tpfft_huge(pmfvec_t op,unsigned lgT,ulong n,ulong z,ulong t)712 pmfvec_tpfft_huge (pmfvec_t op, unsigned lgT, ulong n, ulong z, ulong t)
713 {
714 ZNP_ASSERT (op->lgK <= op->lgM + 1);
715 ZNP_ASSERT (t * op->K < 2 * op->M);
716 ZNP_ASSERT (lgT > 0 && lgT < op->lgK);
717 ZNP_ASSERT (n >= 1 && n <= op->K);
718 ZNP_ASSERT (z >= 1 && z <= op->K);
719
720 unsigned lgK = op->lgK;
721 unsigned lgU = lgK - lgT;
722
723 ulong K = op->K;
724 ulong T = 1UL << lgT;
725 ulong U = 1UL << lgU;
726
727 ptrdiff_t skip = op->skip;
728 ptrdiff_t skip_U = skip << lgU;
729
730 ulong* data = op->data;
731
732 ulong nU = n & (U - 1);
733 ulong nT = n >> lgU;
734
735 ulong nT_ceil = nT + (nU > 0);
736
737 ulong zT = z >> lgU;
738 ulong zU = z & (U - 1);
739 ulong zU2 = zT ? U : zU;
740
741 ulong r = op->M >> (lgK - 1);
742 ulong s, i;
743
744 op->K = U;
745 op->lgK = lgU;
746 t <<= lgT;
747
748 for (i = 0; i < nT; i++, op->data += skip_U)
749 pmfvec_tpfft (op, U, zU2, t);
750
751 if (nU)
752 pmfvec_tpfft (op, nU, zU2, t);
753
754 op->data = data;
755 op->K = T;
756 op->lgK = lgT;
757 op->skip = skip_U;
758 t >>= lgT;
759
760 for (i = 0, s = t; i < zU; i++, op->data += skip, s += r)
761 pmfvec_tpfft (op, nT_ceil, zT + 1, s);
762
763 for (; i < zU2; i++, op->data += skip, s += r)
764 pmfvec_tpfft (op, nT_ceil, zT, s);
765
766 op->data = data;
767 op->skip = skip;
768 op->K = K;
769 op->lgK = lgK;
770 }
771
772
773
774 void
pmfvec_tpfft(pmfvec_t op,ulong n,ulong z,ulong t)775 pmfvec_tpfft (pmfvec_t op, ulong n, ulong z, ulong t)
776 {
777 ZNP_ASSERT (op->lgK <= op->lgM + 1);
778 ZNP_ASSERT (t * op->K < 2 * op->M);
779 ZNP_ASSERT (n >= 1 && n <= op->K);
780 ZNP_ASSERT (z >= 1 && z <= op->K);
781
782 if (op->K <= 2 || 2 * op->K * op->M * sizeof (ulong) <= ZNP_CACHE_SIZE)
783 {
784 pmfvec_tpfft_dc (op, n, z, t);
785 }
786 else
787 {
788 pmfvec_tpfft_huge (op, op->lgK / 2, n, z, t);
789 }
790 }
791
792
793
794 /* ============================================================================
795
796 transposed inverse IFFT routines
797
798 ============================================================================ */
799
800 void
pmfvec_tpifft_basecase(pmfvec_t op,ulong t)801 pmfvec_tpifft_basecase (pmfvec_t op, ulong t)
802 {
803 ZNP_ASSERT (op->lgK <= op->lgM + 1);
804 ZNP_ASSERT (t * op->K < 2*op->M);
805
806 if (op->lgK == 0)
807 return;
808
809 ulong M = op->M;
810 const zn_mod_struct* mod = op->mod;
811
812 ulong s, r = M >> (op->lgK - 1);
813 ptrdiff_t half = op->skip << (op->lgK - 1);
814 ulong* end = op->data + (op->skip << op->lgK);
815 ulong* p;
816 ulong* start;
817
818 for (; r <= M; r <<= 1, half >>= 1, t <<= 1)
819 for (start = op->data, s = t; s < M; s += r, start += op->skip)
820 for (p = start; p < end; p += 2 * half)
821 {
822 pmf_bfly (p, p + half, M, mod);
823 pmf_rotate (p + half, M - s);
824 }
825 }
826
827
828
829 void
pmfvec_tpifft_dc(pmfvec_t op,ulong n,int fwd,ulong z,ulong t)830 pmfvec_tpifft_dc (pmfvec_t op, ulong n, int fwd, ulong z, ulong t)
831 {
832 ZNP_ASSERT (op->lgK <= op->lgM + 1);
833 ZNP_ASSERT (t * op->K < 2 * op->M);
834 ZNP_ASSERT (z >= 1 && z <= op->K);
835 ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
836 ZNP_ASSERT (n <= z);
837
838 if (op->K == 1)
839 return;
840
841 if (n == op->K)
842 {
843 pmfvec_tpifft_basecase (op, t);
844 return;
845 }
846
847 const zn_mod_struct* mod = op->mod;
848
849 op->lgK--;
850 op->K >>= 1;
851
852 long i;
853 ulong M = op->M;
854 ulong U = op->K;
855 pmf_t p = op->data;
856 ptrdiff_t skip = op->skip;
857 ptrdiff_t half = skip << op->lgK;
858
859 if (n + fwd <= U)
860 {
861 long zU2 = ZNP_MIN (z, U);
862 long last_zero_fwd_bfly = ZNP_MAX (z - zU2, n);
863 long last_zero_cross_bfly = ZNP_MIN (z - zU2, n);
864
865 for (i = 0; i < last_zero_cross_bfly; i++, p += skip)
866 {
867 pmf_set (p + half, p, M);
868 pmf_rotate (p + half, M);
869 pmf_add (p, p, M, mod);
870 }
871
872 for (; i < n; i++, p += skip)
873 pmf_add (p, p, M, mod);
874
875 pmfvec_tpifft_dc (op, n, fwd, zU2, t << 1);
876
877 for (; i < last_zero_fwd_bfly; i++, p += skip)
878 {
879 pmf_divby2 (p, M, mod);
880 pmf_set (p + half, p, M);
881 }
882
883 for (; i < zU2; i++, p += skip)
884 pmf_divby2 (p, M, mod);
885 }
886 else
887 {
888 long last_zero_cross_bfly = z - U;
889 long last_cross_bfly = n - U;
890 ulong r = M >> op->lgK;
891 ulong s = t;
892
893 for (i = 0; i < last_cross_bfly; i++, s += r, p += skip)
894 {
895 pmf_bfly (p, p + half, M, mod);
896 pmf_rotate (p + half, M - s);
897 }
898
899 op->data += half;
900 pmfvec_tpifft_dc (op, n - U, fwd, U, t << 1);
901 op->data -= half;
902
903 for (; i < last_zero_cross_bfly; i++, s += r, p += skip)
904 {
905 pmf_rotate (p + half, M + s);
906 pmf_sub (p + half, p, M, mod);
907 pmf_sub (p, p + half, M, mod);
908 }
909
910 for (; i < U; i++, s += r, p += skip)
911 {
912 pmf_add (p, p, M, mod);
913 pmf_rotate (p + half, s);
914 pmf_add (p, p + half, M, mod);
915 }
916
917 pmfvec_tpifft_basecase (op, t << 1);
918 }
919
920 op->K <<= 1;
921 op->lgK++;
922 }
923
924
925
926 void
pmfvec_tpifft_huge(pmfvec_t op,unsigned lgT,ulong n,int fwd,ulong z,ulong t)927 pmfvec_tpifft_huge (pmfvec_t op, unsigned lgT, ulong n, int fwd, ulong z,
928 ulong t)
929 {
930 ZNP_ASSERT (op->lgK <= op->lgM + 1);
931 ZNP_ASSERT (t * op->K < 2 * op->M);
932 ZNP_ASSERT (z >= 1 && z <= op->K);
933 ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
934 ZNP_ASSERT (n <= z);
935 ZNP_ASSERT (lgT > 0 && lgT < op->lgK);
936
937 unsigned lgK = op->lgK;
938 unsigned lgU = lgK - lgT;
939
940 ulong K = op->K;
941 ulong T = 1UL << lgT;
942 ulong U = 1UL << lgU;
943
944 ptrdiff_t skip = op->skip;
945 ptrdiff_t skip_U = skip << lgU;
946
947 ulong* data = op->data;
948
949 ulong nU = n & (U - 1);
950 ulong nT = n >> lgU;
951
952 ulong zU = z & (U - 1);
953 ulong zT = z >> lgU;
954 ulong zU2 = zT ? U : zU;
955
956 ulong mU1 = ZNP_MIN (zU, nU);
957 ulong mU2 = ZNP_MAX (zU, nU);
958
959 int fwd2 = nU || fwd;
960
961 ulong r = op->M >> (lgK - 1);
962 ulong s, i;
963 ulong tT = t << lgT;
964
965 if (fwd2)
966 {
967 op->lgK = lgT;
968 op->K = T;
969 op->skip = skip_U;
970
971 for (i = 0, op->data = data, s = t; i < mU1;
972 i++, op->data += skip, s += r)
973 {
974 pmfvec_tpifft (op, nT + 1, 0, zT + 1, s);
975 }
976 for (; i < nU; i++, op->data += skip, s += r)
977 pmfvec_tpifft (op, nT + 1, 0, zT, s);
978
979 op->data = data + nT * skip_U;
980 op->lgK = lgU;
981 op->K = U;
982 op->skip = skip;
983 pmfvec_tpifft (op, nU, fwd, zU2, tT);
984 }
985
986 op->lgK = lgT;
987 op->K = T;
988 op->skip = skip_U;
989
990 for (i = nU, op->data = data + (skip * nU), s = t + (r * nU); i < mU2;
991 i++, op->data += skip, s += r)
992 {
993 pmfvec_tpifft (op, nT, fwd2, zT + 1, s);
994 }
995 for (; i < zU2; i++, op->data += skip, s += r)
996 pmfvec_tpifft (op, nT, fwd2, zT, s);
997
998 op->data = data;
999 op->skip = skip;
1000 op->lgK = lgU;
1001 op->K = U;
1002 for (i = 0; i < nT; i++, op->data += skip_U)
1003 pmfvec_tpifft (op, U, 0, U, tT);
1004
1005 op->data = data;
1006 op->lgK = lgK;
1007 op->K = K;
1008 }
1009
1010
1011
1012 void
pmfvec_tpifft(pmfvec_t op,ulong n,int fwd,ulong z,ulong t)1013 pmfvec_tpifft (pmfvec_t op, ulong n, int fwd, ulong z, ulong t)
1014 {
1015 ZNP_ASSERT (op->lgK <= op->lgM + 1);
1016 ZNP_ASSERT (t * op->K < 2 * op->M);
1017 ZNP_ASSERT (z >= 1 && z <= op->K);
1018 ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
1019 ZNP_ASSERT (n <= z);
1020
1021 if (op->K <= 2 || 2 * op->K * op->M * sizeof (ulong) <= ZNP_CACHE_SIZE)
1022 {
1023 pmfvec_tpifft_dc (op, n, fwd, z, t);
1024 }
1025 else
1026 {
1027 pmfvec_tpifft_huge (op, op->lgK / 2, n, fwd, z, t);
1028 }
1029 }
1030
1031
1032
1033 // end of file ****************************************************************
1034