1 /*
2   This file is for the Gao-Mateer FFT
3   sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf
4 */
5 
6 #include "fft.h"
7 
8 #include "vec.h"
9 
10 /* input: in, polynomial in bitsliced form */
11 /* output: in, result of applying the radix conversions on in */
radix_conversions(uint64_t * in)12 static void radix_conversions(uint64_t *in) {
13     int i, j, k;
14 
15     const uint64_t mask[5][2] = {
16         {0x8888888888888888, 0x4444444444444444},
17         {0xC0C0C0C0C0C0C0C0, 0x3030303030303030},
18         {0xF000F000F000F000, 0x0F000F000F000F00},
19         {0xFF000000FF000000, 0x00FF000000FF0000},
20         {0xFFFF000000000000, 0x0000FFFF00000000}
21     };
22 
23     const uint64_t s[5][GFBITS] = {
24 #include "scalars.inc"
25     };
26 
27     //
28 
29     for (j = 0; j <= 4; j++) {
30         for (i = 0; i < GFBITS; i++) {
31             for (k = 4; k >= j; k--) {
32                 in[i] ^= (in[i] & mask[k][0]) >> (1 << k);
33                 in[i] ^= (in[i] & mask[k][1]) >> (1 << k);
34             }
35         }
36 
37         PQCLEAN_MCELIECE348864F_AVX_vec_mul(in, in, s[j]); // scaling
38     }
39 }
40 
41 /* input: in, result of applying the radix conversions to the input polynomial */
42 /* output: out, evaluation results (by applying the FFT butterflies) */
butterflies(vec256 out[][GFBITS],const uint64_t * in)43 static void butterflies(vec256 out[][ GFBITS ], const uint64_t *in) {
44     int i, j, k, s, b;
45 
46     uint64_t t0, t1, t2, t3;
47 
48     const vec256 consts[ 17 ][ GFBITS ] = {
49 #include "consts.inc"
50     };
51 
52     uint64_t consts_ptr = 0;
53 
54     const unsigned char reversal[64] = {
55         0, 32, 16, 48,  8, 40, 24, 56,
56         4, 36, 20, 52, 12, 44, 28, 60,
57         2, 34, 18, 50, 10, 42, 26, 58,
58         6, 38, 22, 54, 14, 46, 30, 62,
59         1, 33, 17, 49,  9, 41, 25, 57,
60         5, 37, 21, 53, 13, 45, 29, 61,
61         3, 35, 19, 51, 11, 43, 27, 59,
62         7, 39, 23, 55, 15, 47, 31, 63
63     };
64 
65     // boradcast
66 
67     vec256 tmp256[ GFBITS ];
68     vec256 x[ GFBITS ], y[ GFBITS ];
69 
70     for (j = 0; j < 64; j += 8) {
71         for (i = 0; i < GFBITS; i++) {
72             t0 = (in[i] >> reversal[j + 0]) & 1;
73             t0 = -t0;
74             t1 = (in[i] >> reversal[j + 2]) & 1;
75             t1 = -t1;
76             t2 = (in[i] >> reversal[j + 4]) & 1;
77             t2 = -t2;
78             t3 = (in[i] >> reversal[j + 6]) & 1;
79             t3 = -t3;
80 
81             out[j / 4 + 0][i] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(t0, t1, t2, t3);
82 
83             t0 = (in[i] >> reversal[j + 1]) & 1;
84             t0 = -t0;
85             t1 = (in[i] >> reversal[j + 3]) & 1;
86             t1 = -t1;
87             t2 = (in[i] >> reversal[j + 5]) & 1;
88             t2 = -t2;
89             t3 = (in[i] >> reversal[j + 7]) & 1;
90             t3 = -t3;
91 
92             out[j / 4 + 1][i] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(t0, t1, t2, t3);
93         }
94     }
95 
96     //
97 
98     for (i = 0; i < 16; i += 2) {
99         PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, out[i + 1], consts[ 0 ]);
100 
101         for (b = 0; b < GFBITS; b++) {
102             out[i + 0][b] ^= tmp256[b];
103         }
104         for (b = 0; b < GFBITS; b++) {
105             out[i + 1][b] ^= out[i + 0][b];
106         }
107     }
108 
109     for (i = 0; i < 16; i += 2) {
110         for (b = 0; b < GFBITS; b++) {
111             x[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low_2x(out[i + 0][b], out[i + 1][b]);
112         }
113         for (b = 0; b < GFBITS; b++) {
114             y[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high_2x(out[i + 0][b], out[i + 1][b]);
115         }
116 
117         PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, y, consts[ 1 ]);
118 
119         for (b = 0; b < GFBITS; b++) {
120             x[b] ^= tmp256[b];
121         }
122         for (b = 0; b < GFBITS; b++) {
123             y[b] ^= x[b];
124         }
125 
126         for (b = 0; b < GFBITS; b++) {
127             out[i + 0][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low(x[b], y[b]);
128         }
129         for (b = 0; b < GFBITS; b++) {
130             out[i + 1][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high(x[b], y[b]);
131         }
132     }
133 
134     consts_ptr = 2;
135 
136     for (i = 0; i <= 3; i++) {
137         s = 1 << i;
138 
139         for (j = 0; j < 16; j += 2 * s) {
140             for (k = j; k < j + s; k++) {
141                 PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, out[k + s], consts[ consts_ptr + (k - j) ]);
142 
143                 for (b = 0; b < GFBITS; b++) {
144                     out[k][b] ^= tmp256[b];
145                 }
146                 for (b = 0; b < GFBITS; b++) {
147                     out[k + s][b] ^= out[k][b];
148                 }
149             }
150         }
151 
152         consts_ptr += s;
153     }
154 
155     // adding the part contributed by x^64
156 
157     vec256 powers[16][GFBITS] = {
158 #include "powers.inc"
159     };
160 
161     for (i = 0; i < 16; i++) {
162         for (b = 0; b < GFBITS; b++) {
163             out[i][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(out[i][b], powers[i][b]);
164         }
165     }
166 }
167 
PQCLEAN_MCELIECE348864F_AVX_fft(vec256 out[][GFBITS],uint64_t * in)168 void PQCLEAN_MCELIECE348864F_AVX_fft(vec256 out[][ GFBITS ], uint64_t *in) {
169     radix_conversions(in);
170     butterflies(out, in);
171 }
172 
173