1 /*
2    pmfvec_fft.c:  FFT/IFFT and transposed FFT/IFFT routines for pmfvec_t
3 
4    Copyright (C) 2007, 2008, David Harvey
5 
6    This file is part of the zn_poly library (version 0.9).
7 
8    This program is free software: you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation, either version 2 of the License, or
11    (at your option) version 3 of the License.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program.  If not, see <http://www.gnu.org/licenses/>.
20 
21 */
22 
23 #include <stdio.h>
24 #include "zn_poly_internal.h"
25 
26 
27 /* ============================================================================
28 
29      FFT routines
30 
31 ============================================================================ */
32 
33 
34 void
pmfvec_fft_basecase(pmfvec_t op,ulong t)35 pmfvec_fft_basecase (pmfvec_t op, ulong t)
36 {
37    ZNP_ASSERT (op->lgK <= op->lgM + 1);
38    ZNP_ASSERT (t * op->K < 2 * op->M);
39 
40    if (op->lgK == 0)
41       return;
42 
43    // just plain butterfly loop
44 
45    ulong M = op->M;
46    const zn_mod_struct* mod = op->mod;
47 
48    ulong s, r = M >> (op->lgK - 1);
49    ptrdiff_t half = op->skip << (op->lgK - 1);
50    ulong* end = op->data + (op->skip << op->lgK);
51    ulong* p;
52    ulong* start;
53 
54    for (; r <= M; r <<= 1, half >>= 1, t <<= 1)
55    for (start = op->data, s = t; s < M; s += r, start += op->skip)
56    for (p = start; p < end; p += 2 * half)
57    {
58       pmf_bfly (p, p + half, M, mod);
59       pmf_rotate (p + half, M + s);
60    }
61 }
62 
63 
64 
65 void
pmfvec_fft_dc(pmfvec_t op,ulong n,ulong z,ulong t)66 pmfvec_fft_dc (pmfvec_t op, ulong n, ulong z, ulong t)
67 {
68    ZNP_ASSERT (op->lgK <= op->lgM + 1);
69    ZNP_ASSERT (t * op->K < 2 * op->M);
70    ZNP_ASSERT (n >= 1 && n <= op->K);
71    ZNP_ASSERT (z >= 1 && z <= op->K);
72 
73    if (op->K == 1)
74       return;
75 
76    if (n == op->K  &&  z == op->K)
77    {
78       // No truncation requested; use iterative version
79       pmfvec_fft_basecase (op, t);
80       return;
81    }
82 
83    const zn_mod_struct* mod = op->mod;
84 
85    // We treat the input as two rows and U = K/2 columns, in row-major order.
86 
87    // descend to first row (first half of op)
88    op->lgK--;
89    op->K >>= 1;
90 
91    long i;
92    ulong M = op->M;
93    ulong U = op->K;
94    ulong* p = op->data;
95    ptrdiff_t skip = op->skip;
96    ptrdiff_t half = skip << op->lgK;
97    ulong z2 = ZNP_MIN (z, U);
98 
99    if (n <= U)
100    {
101       // Only need the first output of the first layer of butterflies.
102       for (i = 0; i < (long)(z - U); i++, p += skip)
103          pmf_add (p, p + half, M, mod);
104 
105       // Recurse into top row
106       pmfvec_fft_dc (op, n, z2, t << 1);
107    }
108    else
109    {
110       // Need both outputs from the first layer of butterflies.
111       ulong s = t;
112       ulong r = M >> op->lgK;
113 
114       for (i = 0; i < (long)(z - U); i++, p += skip, s += r)
115       {
116          pmf_bfly (p, p + half, M, mod);
117          pmf_rotate (p + half, M + s);
118       }
119 
120       // Butterflies where second input is zero
121       for (; i < z2; i++, p += skip, s += r)
122       {
123          pmf_set (p + half, p, M);
124          pmf_rotate (p + half, s);
125       }
126 
127       // Recurse into top row...
128       pmfvec_fft_dc (op, U, z2, t << 1);
129 
130       // ... and recurse into bottom row
131       op->data += half;
132       pmfvec_fft_dc (op, n - U, z2, t << 1);
133       op->data -= half;
134    }
135 
136    // pop back to whole transform
137    op->K <<= 1;
138    op->lgK++;
139 }
140 
141 
142 
143 /*
144    As described above, this splits the length K transform into T = 2^lgT rows
145    by U = 2^lgU columns, where K = U * T.
146 
147    Must have 0 < lgT < lgK.
148 */
149 void
pmfvec_fft_huge(pmfvec_t op,unsigned lgT,ulong n,ulong z,ulong t)150 pmfvec_fft_huge (pmfvec_t op, unsigned lgT, ulong n, ulong z, ulong t)
151 {
152    ZNP_ASSERT (op->lgK <= op->lgM + 1);
153    ZNP_ASSERT (t * op->K < 2 * op->M);
154    ZNP_ASSERT (lgT > 0  &&  lgT < op->lgK);
155    ZNP_ASSERT (n >= 1 && n <= op->K);
156    ZNP_ASSERT (z >= 1 && z <= op->K);
157 
158    unsigned lgK = op->lgK;
159    unsigned lgU = lgK - lgT;
160 
161    ulong K = op->K;
162    ulong T = 1UL << lgT;
163    ulong U = 1UL << lgU;
164 
165    ptrdiff_t skip = op->skip;
166    ptrdiff_t skip_U = skip << lgU;
167 
168    ulong* data = op->data;
169 
170    // We need n output coefficients, starting from the top-left, in row-major
171    // order.
172 
173    // Write n = U * nT + nU, where 0 <= nU < U
174    ulong nU = n & (U - 1);
175    ulong nT = n >> lgU;
176 
177    // nT_ceil = number of rows of output, including the last partial row
178    ulong nT_ceil = nT + (nU > 0);
179 
180    // Write z = U * zT + zU, where 0 <= zU < U
181    ulong zT = z >> lgU;
182    ulong zU = z & (U - 1);
183    ulong zU2 = zT ? U : zU;
184 
185    ulong r = op->M >> (lgK - 1);
186    ulong s, i;
187 
188    // --------------- FFTs along columns
189 
190    op->K = T;
191    op->lgK = lgT;
192    op->skip = skip_U;
193 
194    // First handle the columns with zT + 1 input coefficients.
195    for (i = 0, s = t; i < zU; i++, op->data += skip, s += r)
196       pmfvec_fft (op, nT_ceil, zT + 1, s);
197 
198    // Handle the remaining columns, which only have zT input coefficients.
199    for (; i < zU2; i++, op->data += skip, s += r)
200       pmfvec_fft (op, nT_ceil, zT, s);
201 
202    // --------------- FFTs along rows
203 
204    op->data = data;
205    op->K = U;
206    op->lgK = lgU;
207    op->skip = skip;
208    t <<= lgT;
209 
210    // Handle the first nT rows.
211    for (i = 0; i < nT; i++, op->data += skip_U)
212       pmfvec_fft (op, U, zU2, t);
213 
214    // For the last row, we only need the first nU outputs:
215    if (nU)
216       pmfvec_fft (op, nU, zU2, t);
217 
218    // --------------- restore parameters
219 
220    op->data = data;
221    op->K = K;
222    op->lgK = lgK;
223 }
224 
225 
226 
227 void
pmfvec_fft(pmfvec_t op,ulong n,ulong z,ulong t)228 pmfvec_fft (pmfvec_t op, ulong n, ulong z, ulong t)
229 {
230    ZNP_ASSERT (op->lgK <= op->lgM + 1);
231    ZNP_ASSERT (t * op->K < 2 * op->M);
232    ZNP_ASSERT (n >= 1 && n <= op->K);
233    ZNP_ASSERT (z >= 1 && z <= op->K);
234 
235    if (op->K <= 2  ||  2 * op->K * op->M * sizeof (ulong) <= ZNP_CACHE_SIZE)
236    {
237       // FFT is pretty small; use divide-and-conquer
238       pmfvec_fft_dc (op, n, z, t);
239    }
240    else
241    {
242       // FFT is relatively big; use factoring algorithm instead
243       pmfvec_fft_huge (op, op->lgK / 2, n, z, t);
244    }
245 }
246 
247 
248 
249 /* ============================================================================
250 
251      inverse FFT routines
252 
253 ============================================================================ */
254 
255 void
pmfvec_ifft_basecase(pmfvec_t op,ulong t)256 pmfvec_ifft_basecase (pmfvec_t op, ulong t)
257 {
258    ZNP_ASSERT (op->lgK <= op->lgM + 1);
259    ZNP_ASSERT (t * op->K < 2 * op->M);
260 
261    if (op->lgK == 0)
262       return;
263 
264    // just plain butterfly loop
265 
266    ulong M = op->M;
267    const zn_mod_struct* mod = op->mod;
268 
269    ulong s, r = M;
270    ulong r_last = M >> (op->lgK - 1);
271    t <<= (op->lgK - 1);
272    ptrdiff_t half = op->skip;
273    ulong* end = op->data + (op->skip << op->lgK);
274    ulong* p;
275    ulong* start;
276 
277    for (; r >= r_last; r >>= 1, half <<= 1, t >>= 1)
278    for (start = op->data, s = t; s < M; s += r, start += op->skip)
279    for (p = start; p < end; p += 2 * half)
280    {
281       pmf_rotate (p + half, M - s);
282       pmf_bfly (p + half, p, M, mod);
283    }
284 }
285 
286 
287 
288 void
pmfvec_ifft_dc(pmfvec_t op,ulong n,int fwd,ulong z,ulong t)289 pmfvec_ifft_dc (pmfvec_t op, ulong n, int fwd, ulong z, ulong t)
290 {
291    ZNP_ASSERT (op->lgK <= op->lgM + 1);
292    ZNP_ASSERT (t * op->K < 2 * op->M);
293    ZNP_ASSERT (z >= 1 && z <= op->K);
294    ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
295    ZNP_ASSERT (n <= z);
296 
297    if (op->K == 1)
298       return;
299 
300    if (n == op->K)
301    {
302       // No truncation requested; use iterative version
303       pmfvec_ifft_basecase (op, t);
304       return;
305    }
306 
307    const zn_mod_struct* mod = op->mod;
308 
309    // We treat the input as two rows and U = K / 2 columns, in row-major order.
310 
311    // descend to first row (first half of op)
312    op->K >>= 1;
313    op->lgK--;
314 
315    ulong M = op->M;
316    ulong U = op->K;
317    ptrdiff_t skip = op->skip;
318    ptrdiff_t half = skip << op->lgK;
319 
320    // symbols in the following diagrams:
321    // A = fully untransformed coefficient (one of the a_i)
322    // B = intermediate coefficient
323    // C = fully transformed coefficient (one of the b_k)
324    // a, b, c = same as three above, but implied zero
325    // ? = garbage that we don't care about
326    // * = the "forward" C coefficient, or "?" if no forward coefficient
327    //     requested
328 
329    // The horizontal transforms convert between B and C.
330    // The vertical butterflies convert between A and B.
331 
332    if (n + fwd <= U)
333    {
334       // The input could look like one of the following:
335       // CCCCAAAA      CCCCAAAA      CCCCAAaa      CCCCaaaa
336       // AAAAAAaa  or  AAaaaaaa  or  aaaaaaaa  or  aaaaaaaa
337 
338       long zU2 = ZNP_MIN (z, U);
339       long last_zero_fwd_bfly = ZNP_MAX (z - zU2, n);
340       long last_zero_cross_bfly = ZNP_MIN (z - zU2, n);
341 
342       long i = zU2 - 1;
343       pmf_t p = op->data + skip * i;
344 
345       // First some forward butterflies ("Aa" => "B?") to make them look like:
346       // CCCCAABB      CCCCBBBB      CCCCBBaa      CCCCaaaa
347       // AAAAAA??  or  AAaa????  or  aaaa??aa  or  aaaaaaaa
348       for (; i >= last_zero_fwd_bfly; i--, p -= skip)
349       {
350          // (2*a0, ?) -> (a0, ?)   = (b0, ?)
351          pmf_divby2 (p, M, mod);
352       }
353 
354       // Then some forward butterflies ("AA" => "B?") to make them look like:
355       // CCCCBBBB      CCCCBBBB      CCCCBBaa      CCCCaaaa
356       // AAAA????  or  AAaa????  or  aaaa??aa  or  aaaaaaaa
357       for (; i >= (long) n; i--, p -= skip)
358       {
359          // (2*a0, 2*a1) -> (a0 + a1, ?)   = (b0, ?)
360          pmf_add (p, p + half, M, mod);
361          pmf_divby2 (p, M, mod);
362       }
363 
364       // Transform the first row to make them look like:
365       // BBBB*???      BBBB*???      BBBB*???      BBBB*???
366       // AAAA????  or  AAaa????  or  aaaa??aa  or  aaaaaaaa
367       pmfvec_ifft_dc (op, n, fwd, zU2, t << 1);
368 
369       // Cross butterflies ("Ba" => "A?") to make them look like:
370       // BBBB*???      BBAA*???      AAAA*???      AAAA*???
371       // AAAA????  or  AA??????  or  ??????aa  or  ????aaaa
372       for (; i >= last_zero_cross_bfly; i--, p -= skip)
373       {
374          // (b0, ?) -> (2*b0, ?)    = (2*a0, ?)
375          pmf_add (p, p, M, mod);
376       }
377 
378       // Cross butterflies ("BA" => "A?") to make them look like:
379       // AAAA*???      AAAA*???      AAAA*???      AAAA*???
380       // ????????  or  ????????  or  ??????aa  or  ????aaaa
381       for (; i >= 0; i--, p -= skip)
382       {
383          // (b0, 2*a1) -> (2*b0 - 2*a1, ?)     = (2*a0, ?)
384          pmf_add (p, p, M, mod);
385          pmf_sub (p, p + half, M, mod);
386       }
387    }
388    else
389    {
390       // The input looks like one of these:
391       // CCCCCCCC                 CCCCCCCC
392       // AAAAaaaa (fwd == 1)  or  CCCAAAaa
393 
394       // Transform first row (no truncation necessary) to make them look like:
395       // BBBBBBBB                 BBBBBBBB
396       // AAAAaaaa (fwd == 1)  or  CCCAAAaa
397       pmfvec_ifft_basecase (op, t << 1);
398 
399       long i = U - 1;
400       ulong r = M >> op->lgK;
401       ulong s = t + r * i;
402       pmf_t p = op->data + skip * i;
403 
404       long last_zero_cross_bfly = z - U;
405       long last_cross_bfly = n - U;
406 
407       // Cross butterflies ("Ba" => "AB") to make them look like:
408       // BBBBAAAA                 BBBBBBAA
409       // AAAABBBB (fwd == 1)  or  CCCAAABB
410       for (; i >= last_zero_cross_bfly; i--, s -= r, p -= skip)
411       {
412          // (b0, ?) -> (2*b0, w*b0)     = (2*a0, b1)
413          pmf_set (p + half, p, M);
414          pmf_rotate (p + half, s);
415          pmf_add (p, p, M, mod);
416       }
417 
418       // Cross butterflies ("BA" => "AB") to make them look like:
419       // AAAAAAAA                 BBBAAAAA
420       // BBBBBBBB (fwd == 1)  or  CCCBBBBB
421       for (; i >= last_cross_bfly; i--, s -= r, p -= skip)
422       {
423          // (b0, 2*a1) -> (2*(b0-a1), w*(b0-2*a1))    = (2*a0, b1)
424          pmf_sub (p + half, p, M, mod);
425          pmf_sub (p, p + half, M, mod);
426          pmf_rotate (p + half, M + s);
427       }
428 
429       // Transform second row to make them look like:
430       // AAAAAAAA                 BBBAAAAA
431       // *??????? (fwd == 1)  or  BBB*????
432       op->data += half;
433       pmfvec_ifft_dc (op, n - U, fwd, U, t << 1);
434       op->data -= half;
435 
436       // Inverse butterflies ("BB" => "AA") to make them look like:
437       // AAAAAAAA                 AAAAAAAA
438       // *??????? (fwd == 1)  or  AAA*????
439       for (; i >= 0; i--, s -= r, p -= skip)
440       {
441          // (b0, b1) -> (b0 + w*b1, b0 - w*b1)    = (2*a0, 2*a1)
442          pmf_rotate (p + half, M - s);
443          pmf_bfly (p + half, p, M, mod);
444       }
445    }
446 
447    // pop back to full size
448    op->K <<= 1;
449    op->lgK++;
450 }
451 
452 
453 
454 void
pmfvec_ifft_huge(pmfvec_t op,unsigned lgT,ulong n,int fwd,ulong z,ulong t)455 pmfvec_ifft_huge (pmfvec_t op, unsigned lgT, ulong n, int fwd, ulong z,
456                   ulong t)
457 {
458    ZNP_ASSERT (op->lgK <= op->lgM + 1);
459    ZNP_ASSERT (t * op->K < 2 * op->M);
460    ZNP_ASSERT (z >= 1 && z <= op->K);
461    ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
462    ZNP_ASSERT (n <= z);
463    ZNP_ASSERT (lgT > 0  &&  lgT < op->lgK);
464 
465    unsigned lgK = op->lgK;
466    unsigned lgU = lgK - lgT;
467 
468    ulong K = op->K;
469    ulong T = 1UL << lgT;
470    ulong U = 1UL << lgU;
471 
472    ptrdiff_t skip = op->skip;
473    ptrdiff_t skip_U = skip << lgU;
474 
475    ulong* data = op->data;
476 
477    // Write n = U * nT + nU, where 0 <= nU < U
478    ulong nU = n & (U - 1);
479    ulong nT = n >> lgU;
480 
481    // Write z = U * zT + zU, where 0 <= zU < U
482    ulong zU = z & (U - 1);
483    ulong zT = z >> lgU;
484    ulong zU2 = zT ? U : zU;
485 
486    ulong mU1 = ZNP_MIN (zU, nU);
487    ulong mU2 = ZNP_MAX (zU, nU);
488 
489    int fwd2 = nU || fwd;
490 
491    ulong r = op->M >> (lgK - 1);
492    ulong s, i;
493    ulong tT = t << lgT;
494 
495    // where:
496    // symbols in the following diagrams:
497    // A = fully untransformed coefficient (one of the a_i)
498    // B = intermediate coefficient
499    // C = fully transformed coefficient (one of the b_k)
500    // ? = garbage that we don't care about
501    // * = the "forward" C coefficient, or "?" if no forward coefficient
502    //     requested
503 
504    // The input looks something like this:
505    //
506    // CCCCCCCC
507    // CCCCCCCC
508    // CCCAAAAA
509    // AAAAAAAA
510    //
511    // (we won't bother marking in the locations of zeroes on the diagrams)
512    //
513    // The horizontal transforms convert between B and C.
514    // The vertical transforms convert between A and B.
515 
516    // First do row transforms for complete rows, to make it look like:
517    // BBBBBBBB
518    // BBBBBBBB
519    // CCCAAAAA
520    // AAAAAAAA
521    op->lgK = lgU;
522    op->K = U;
523    for (i = 0; i < nT; i++, op->data += skip_U)
524       pmfvec_ifft (op, U, 0, U, tT);
525 
526    // Column transforms for the rightmost columns, to obtain
527    // BBBAAAAA
528    // BBBAAAAA
529    // CCCBBBBB
530    // AAA?????
531    op->lgK = lgT;
532    op->K = T;
533    op->skip = skip_U;
534 
535    for (i = nU, op->data = data + (skip * nU), s = t + (r * nU); i < mU2;
536         i++, op->data += skip, s += r)
537    {
538       pmfvec_ifft (op, nT, fwd2, zT + 1, s);
539    }
540    for (; i < zU2; i++, op->data += skip, s += r)
541       pmfvec_ifft (op, nT, fwd2, zT, s);
542 
543    // If there is still a partial row to deal with....
544    if (fwd2)
545    {
546       // Transform the partial row to obtain
547       // BBBAAAAA
548       // BBBAAAAA
549       // BBB*????
550       // AAA?????
551       op->data = data + nT * skip_U;
552       op->lgK = lgU;
553       op->K = U;
554       op->skip = skip;
555       pmfvec_ifft (op, nU, fwd, zU2, tT);
556 
557       // Column transforms for the leftmost columns, to obtain
558       // AAAAAAAA
559       // AAAAAAAA
560       // AAA*????
561       // ????????
562       op->lgK = lgT;
563       op->K = T;
564       op->skip = skip_U;
565 
566       for (i = 0, op->data = data, s = t; i < mU1;
567            i++, op->data += skip, s += r)
568       {
569          pmfvec_ifft (op, nT + 1, 0, zT + 1, s);
570       }
571       for (; i < nU; i++, op->data += skip, s += r)
572          pmfvec_ifft (op, nT + 1, 0, zT, s);
573    }
574 
575    // restore parameters
576    op->lgK = lgK;
577    op->K = K;
578    op->skip = skip;
579    op->data = data;
580 }
581 
582 
583 
584 void
pmfvec_ifft(pmfvec_t op,ulong n,int fwd,ulong z,ulong t)585 pmfvec_ifft (pmfvec_t op, ulong n, int fwd, ulong z, ulong t)
586 {
587    ZNP_ASSERT (op->lgK <= op->lgM + 1);
588    ZNP_ASSERT (t * op->K < 2 * op->M);
589    ZNP_ASSERT (z <= op->K);
590    ZNP_ASSERT (n <= z);
591    ZNP_ASSERT (n + fwd <= op->K);
592 
593    if (op->K <= 2  ||  2 * op->K * op->M * sizeof(ulong) <= ZNP_CACHE_SIZE)
594    {
595       // IFFT is pretty small; use use divide-and-conquer
596       pmfvec_ifft_dc (op, n, fwd, z, t);
597    }
598    else
599    {
600       // IFFT is relatively big; use factoring algorithm instead
601       pmfvec_ifft_huge (op, op->lgK / 2, n, fwd, z, t);
602    }
603 }
604 
605 
606 
607 /* ============================================================================
608 
609      transposed FFT routines
610 
611 ============================================================================ */
612 
613 
614 void
pmfvec_tpfft_basecase(pmfvec_t op,ulong t)615 pmfvec_tpfft_basecase (pmfvec_t op, ulong t)
616 {
617    ZNP_ASSERT (op->lgK <= op->lgM + 1);
618    ZNP_ASSERT (t * op->K < 2 * op->M);
619 
620    if (op->lgK == 0)
621       return;
622 
623    ulong M = op->M;
624    const zn_mod_struct* mod = op->mod;
625 
626    ulong s, r = M;
627    ulong r_last = M >> (op->lgK - 1);
628    t <<= (op->lgK - 1);
629    ptrdiff_t half = op->skip;
630    ulong* end = op->data + (op->skip << op->lgK);
631    ulong* p;
632    ulong* start;
633 
634    for (; r >= r_last; r >>= 1, half <<= 1, t >>= 1)
635    for (start = op->data, s = t; s < M; s += r, start += op->skip)
636    for (p = start; p < end; p += 2 * half)
637    {
638       pmf_rotate (p + half, M + s);
639       pmf_bfly (p + half, p, M, mod);
640    }
641 }
642 
643 
644 
645 void
pmfvec_tpfft_dc(pmfvec_t op,ulong n,ulong z,ulong t)646 pmfvec_tpfft_dc (pmfvec_t op, ulong n, ulong z, ulong t)
647 {
648    ZNP_ASSERT (op->lgK <= op->lgM + 1);
649    ZNP_ASSERT (t * op->K < 2 * op->M);
650    ZNP_ASSERT (n >= 1 && n <= op->K);
651    ZNP_ASSERT (z >= 1 && z <= op->K);
652 
653    if (op->K == 1)
654       return;
655 
656    if (n == op->K  &&  z == op->K)
657    {
658       pmfvec_tpfft_basecase (op, t);
659       return;
660    }
661 
662    const zn_mod_struct* mod = op->mod;
663 
664    op->lgK--;
665    op->K >>= 1;
666 
667    long i;
668    ulong M = op->M;
669    ulong U = op->K;
670    ulong* p = op->data;
671    ptrdiff_t skip = op->skip;
672    ptrdiff_t half = skip << op->lgK;
673    ulong z2 = ZNP_MIN (z, U);
674 
675    if (n <= U)
676    {
677       pmfvec_tpfft_dc (op, n, z2, t << 1);
678 
679       for (i = 0; i < (long)(z - U); i++, p += skip)
680          pmf_set (p + half, p, M);
681    }
682    else
683    {
684       op->data += half;
685       pmfvec_tpfft_dc (op, n - U, z2, t << 1);
686       op->data -= half;
687       pmfvec_tpfft_dc (op, U, z2, t << 1);
688 
689       ulong s = t;
690       ulong r = M >> op->lgK;
691 
692       for (i = 0; i < (long)(z - U); i++, p += skip, s += r)
693       {
694          pmf_rotate (p + half, M + s);
695          pmf_bfly (p + half, p, M, mod);
696       }
697 
698       for (; i < z2; i++, p += skip, s += r)
699       {
700          pmf_rotate (p + half, s);
701          pmf_add (p, p + half, M, mod);
702       }
703    }
704 
705    op->K <<= 1;
706    op->lgK++;
707 }
708 
709 
710 
711 void
pmfvec_tpfft_huge(pmfvec_t op,unsigned lgT,ulong n,ulong z,ulong t)712 pmfvec_tpfft_huge (pmfvec_t op, unsigned lgT, ulong n, ulong z, ulong t)
713 {
714    ZNP_ASSERT (op->lgK <= op->lgM + 1);
715    ZNP_ASSERT (t * op->K < 2 * op->M);
716    ZNP_ASSERT (lgT > 0  &&  lgT < op->lgK);
717    ZNP_ASSERT (n >= 1 && n <= op->K);
718    ZNP_ASSERT (z >= 1 && z <= op->K);
719 
720    unsigned lgK = op->lgK;
721    unsigned lgU = lgK - lgT;
722 
723    ulong K = op->K;
724    ulong T = 1UL << lgT;
725    ulong U = 1UL << lgU;
726 
727    ptrdiff_t skip = op->skip;
728    ptrdiff_t skip_U = skip << lgU;
729 
730    ulong* data = op->data;
731 
732    ulong nU = n & (U - 1);
733    ulong nT = n >> lgU;
734 
735    ulong nT_ceil = nT + (nU > 0);
736 
737    ulong zT = z >> lgU;
738    ulong zU = z & (U - 1);
739    ulong zU2 = zT ? U : zU;
740 
741    ulong r = op->M >> (lgK - 1);
742    ulong s, i;
743 
744    op->K = U;
745    op->lgK = lgU;
746    t <<= lgT;
747 
748    for (i = 0; i < nT; i++, op->data += skip_U)
749       pmfvec_tpfft (op, U, zU2, t);
750 
751    if (nU)
752       pmfvec_tpfft (op, nU, zU2, t);
753 
754    op->data = data;
755    op->K = T;
756    op->lgK = lgT;
757    op->skip = skip_U;
758    t >>= lgT;
759 
760    for (i = 0, s = t; i < zU; i++, op->data += skip, s += r)
761       pmfvec_tpfft (op, nT_ceil, zT + 1, s);
762 
763    for (; i < zU2; i++, op->data += skip, s += r)
764       pmfvec_tpfft (op, nT_ceil, zT, s);
765 
766    op->data = data;
767    op->skip = skip;
768    op->K = K;
769    op->lgK = lgK;
770 }
771 
772 
773 
774 void
pmfvec_tpfft(pmfvec_t op,ulong n,ulong z,ulong t)775 pmfvec_tpfft (pmfvec_t op, ulong n, ulong z, ulong t)
776 {
777    ZNP_ASSERT (op->lgK <= op->lgM + 1);
778    ZNP_ASSERT (t * op->K < 2 * op->M);
779    ZNP_ASSERT (n >= 1 && n <= op->K);
780    ZNP_ASSERT (z >= 1 && z <= op->K);
781 
782    if (op->K <= 2  ||  2 * op->K * op->M * sizeof (ulong) <= ZNP_CACHE_SIZE)
783    {
784       pmfvec_tpfft_dc (op, n, z, t);
785    }
786    else
787    {
788       pmfvec_tpfft_huge (op, op->lgK / 2, n, z, t);
789    }
790 }
791 
792 
793 
794 /* ============================================================================
795 
796      transposed inverse IFFT routines
797 
798 ============================================================================ */
799 
800 void
pmfvec_tpifft_basecase(pmfvec_t op,ulong t)801 pmfvec_tpifft_basecase (pmfvec_t op, ulong t)
802 {
803    ZNP_ASSERT (op->lgK <= op->lgM + 1);
804    ZNP_ASSERT (t * op->K < 2*op->M);
805 
806    if (op->lgK == 0)
807       return;
808 
809    ulong M = op->M;
810    const zn_mod_struct* mod = op->mod;
811 
812    ulong s, r = M >> (op->lgK - 1);
813    ptrdiff_t half = op->skip << (op->lgK - 1);
814    ulong* end = op->data + (op->skip << op->lgK);
815    ulong* p;
816    ulong* start;
817 
818    for (; r <= M; r <<= 1, half >>= 1, t <<= 1)
819    for (start = op->data, s = t; s < M; s += r, start += op->skip)
820    for (p = start; p < end; p += 2 * half)
821    {
822       pmf_bfly (p, p + half, M, mod);
823       pmf_rotate (p + half, M - s);
824    }
825 }
826 
827 
828 
829 void
pmfvec_tpifft_dc(pmfvec_t op,ulong n,int fwd,ulong z,ulong t)830 pmfvec_tpifft_dc (pmfvec_t op, ulong n, int fwd, ulong z, ulong t)
831 {
832    ZNP_ASSERT (op->lgK <= op->lgM + 1);
833    ZNP_ASSERT (t * op->K < 2 * op->M);
834    ZNP_ASSERT (z >= 1 && z <= op->K);
835    ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
836    ZNP_ASSERT (n <= z);
837 
838    if (op->K == 1)
839       return;
840 
841    if (n == op->K)
842    {
843       pmfvec_tpifft_basecase (op, t);
844       return;
845    }
846 
847    const zn_mod_struct* mod = op->mod;
848 
849    op->lgK--;
850    op->K >>= 1;
851 
852    long i;
853    ulong M = op->M;
854    ulong U = op->K;
855    pmf_t p = op->data;
856    ptrdiff_t skip = op->skip;
857    ptrdiff_t half = skip << op->lgK;
858 
859    if (n + fwd <= U)
860    {
861       long zU2 = ZNP_MIN (z, U);
862       long last_zero_fwd_bfly = ZNP_MAX (z - zU2, n);
863       long last_zero_cross_bfly = ZNP_MIN (z - zU2, n);
864 
865       for (i = 0; i < last_zero_cross_bfly; i++, p += skip)
866       {
867          pmf_set (p + half, p, M);
868          pmf_rotate (p + half, M);
869          pmf_add (p, p, M, mod);
870       }
871 
872       for (; i < n; i++, p += skip)
873          pmf_add (p, p, M, mod);
874 
875       pmfvec_tpifft_dc (op, n, fwd, zU2, t << 1);
876 
877       for (; i < last_zero_fwd_bfly; i++, p += skip)
878       {
879          pmf_divby2 (p, M, mod);
880          pmf_set (p + half, p, M);
881       }
882 
883       for (; i < zU2; i++, p += skip)
884          pmf_divby2 (p, M, mod);
885    }
886    else
887    {
888       long last_zero_cross_bfly = z - U;
889       long last_cross_bfly = n - U;
890       ulong r = M >> op->lgK;
891       ulong s = t;
892 
893       for (i = 0; i < last_cross_bfly; i++, s += r, p += skip)
894       {
895          pmf_bfly (p, p + half, M, mod);
896          pmf_rotate (p + half, M - s);
897       }
898 
899       op->data += half;
900       pmfvec_tpifft_dc (op, n - U, fwd, U, t << 1);
901       op->data -= half;
902 
903       for (; i < last_zero_cross_bfly; i++, s += r, p += skip)
904       {
905          pmf_rotate (p + half, M + s);
906          pmf_sub (p + half, p, M, mod);
907          pmf_sub (p, p + half, M, mod);
908       }
909 
910       for (; i < U; i++, s += r, p += skip)
911       {
912          pmf_add (p, p, M, mod);
913          pmf_rotate (p + half, s);
914          pmf_add (p, p + half, M, mod);
915       }
916 
917       pmfvec_tpifft_basecase (op, t << 1);
918    }
919 
920    op->K <<= 1;
921    op->lgK++;
922 }
923 
924 
925 
926 void
pmfvec_tpifft_huge(pmfvec_t op,unsigned lgT,ulong n,int fwd,ulong z,ulong t)927 pmfvec_tpifft_huge (pmfvec_t op, unsigned lgT, ulong n, int fwd, ulong z,
928                     ulong t)
929 {
930    ZNP_ASSERT (op->lgK <= op->lgM + 1);
931    ZNP_ASSERT (t * op->K < 2 * op->M);
932    ZNP_ASSERT (z >= 1 && z <= op->K);
933    ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
934    ZNP_ASSERT (n <= z);
935    ZNP_ASSERT (lgT > 0  &&  lgT < op->lgK);
936 
937    unsigned lgK = op->lgK;
938    unsigned lgU = lgK - lgT;
939 
940    ulong K = op->K;
941    ulong T = 1UL << lgT;
942    ulong U = 1UL << lgU;
943 
944    ptrdiff_t skip = op->skip;
945    ptrdiff_t skip_U = skip << lgU;
946 
947    ulong* data = op->data;
948 
949    ulong nU = n & (U - 1);
950    ulong nT = n >> lgU;
951 
952    ulong zU = z & (U - 1);
953    ulong zT = z >> lgU;
954    ulong zU2 = zT ? U : zU;
955 
956    ulong mU1 = ZNP_MIN (zU, nU);
957    ulong mU2 = ZNP_MAX (zU, nU);
958 
959    int fwd2 = nU || fwd;
960 
961    ulong r = op->M >> (lgK - 1);
962    ulong s, i;
963    ulong tT = t << lgT;
964 
965    if (fwd2)
966    {
967       op->lgK = lgT;
968       op->K = T;
969       op->skip = skip_U;
970 
971       for (i = 0, op->data = data, s = t; i < mU1;
972            i++, op->data += skip, s += r)
973       {
974          pmfvec_tpifft (op, nT + 1, 0, zT + 1, s);
975       }
976       for (; i < nU; i++, op->data += skip, s += r)
977          pmfvec_tpifft (op, nT + 1, 0, zT, s);
978 
979       op->data = data + nT * skip_U;
980       op->lgK = lgU;
981       op->K = U;
982       op->skip = skip;
983       pmfvec_tpifft (op, nU, fwd, zU2, tT);
984    }
985 
986    op->lgK = lgT;
987    op->K = T;
988    op->skip = skip_U;
989 
990    for (i = nU, op->data = data + (skip * nU), s = t + (r * nU); i < mU2;
991         i++, op->data += skip, s += r)
992    {
993       pmfvec_tpifft (op, nT, fwd2, zT + 1, s);
994    }
995    for (; i < zU2; i++, op->data += skip, s += r)
996       pmfvec_tpifft (op, nT, fwd2, zT, s);
997 
998    op->data = data;
999    op->skip = skip;
1000    op->lgK = lgU;
1001    op->K = U;
1002    for (i = 0; i < nT; i++, op->data += skip_U)
1003       pmfvec_tpifft (op, U, 0, U, tT);
1004 
1005    op->data = data;
1006    op->lgK = lgK;
1007    op->K = K;
1008 }
1009 
1010 
1011 
1012 void
pmfvec_tpifft(pmfvec_t op,ulong n,int fwd,ulong z,ulong t)1013 pmfvec_tpifft (pmfvec_t op, ulong n, int fwd, ulong z, ulong t)
1014 {
1015    ZNP_ASSERT (op->lgK <= op->lgM + 1);
1016    ZNP_ASSERT (t * op->K < 2 * op->M);
1017    ZNP_ASSERT (z >= 1 && z <= op->K);
1018    ZNP_ASSERT (n + fwd >= 1 && n + fwd <= op->K);
1019    ZNP_ASSERT (n <= z);
1020 
1021    if (op->K <= 2  ||  2 * op->K * op->M * sizeof (ulong) <= ZNP_CACHE_SIZE)
1022    {
1023       pmfvec_tpifft_dc (op, n, fwd, z, t);
1024    }
1025    else
1026    {
1027       pmfvec_tpifft_huge (op, op->lgK / 2, n, fwd, z, t);
1028    }
1029 }
1030 
1031 
1032 
1033 // end of file ****************************************************************
1034