/*
  This file is for the Gao-Mateer FFT
  see http://www.math.clemson.edu/~sgao/papers/GM10.pdf
*/

#include "fft.h"

#include "transpose.h"
#include "vec256.h"

#include <stdint.h>

/* input: in, polynomial in bitsliced form */
/* output: in, result of applying the radix conversions to in */
static void radix_conversions(vec128 *in) {
    int i, j, k;
    vec128 t;
    uint64_t v0, v1;

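    /* masks for the in-word radix-conversion steps: at level k, the bits
       selected by mask[k][0] and mask[k][1] are shifted down by (1 << k)
       and folded back in (see the inner loop below) */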
    const vec128 mask[5][2] = {
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444)
        },
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030)
        },
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00)
        },
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000)
        },
        {
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000),
            PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000)
        }
    };

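    /* scaling constants (scalars_2x.inc), applied after every round but the last */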
    const vec128 s[5][GFBITS] = {
#include "scalars_2x.inc"
    };

    // radix conversions interleaved with scaling

    for (j = 0; j <= 5; j++) {
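        /* fold the upper 32 bits of the high lane into its lower 32 bits,
           then fold that result into the upper 32 bits of the low lane */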
        for (i = 0; i < GFBITS; i++) {
            v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[i], 1);
            v1 ^= v1 >> 32;
            v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[i], 0);
            v0 ^= v1 << 32;
            in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v0, v1);
        }

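        /* in-word radix-conversion steps: shift the masked bits down by
           (1 << k) and fold them back in, for every remaining level k */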
        for (i = 0; i < GFBITS; i++) {
            for (k = 4; k >= j; k--) {
                t = PQCLEAN_MCELIECE6688128_AVX_vec128_and(in[i], mask[k][0]);
                t = PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(t, 1 << k);
                in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(in[i], t);

                t = PQCLEAN_MCELIECE6688128_AVX_vec128_and(in[i], mask[k][1]);
                t = PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(t, 1 << k);
                in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(in[i], t);
            }
        }

        if (j < 5) {
            PQCLEAN_MCELIECE6688128_AVX_vec128_mul(in, in, s[j]);    // scaling
        }
    }
}

/* input: in, result of applying the radix conversions to the input polynomial */
/* output: out, evaluation results (by applying the FFT butterflies) */
static void butterflies(vec256 out[][ GFBITS ], vec128 *in) {
    int i, j, k, s, b;

    vec128 tmp[ GFBITS ];
    vec256 tmp0[ GFBITS ];
    vec256 tmp1[ GFBITS ];
    vec128 t[ GFBITS ];

    union {
        vec128 v[8][ GFBITS + 1 ];
        vec256 V[8][ (GFBITS + 1) / 2 ];
    } pre;

    union {
        vec128 v[64][ 2 ];
        vec256 V[64];
    } buf;

    uint64_t v0, v1;

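    /* precomputed constants for the butterfly stages (consts.inc) */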
    const vec256 consts[ 33 ][ GFBITS ] = {
#include "consts.inc"
    };

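    /* precomputed values added at the end for the x^128 term (powers.inc) */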
    const vec256 powers[ 32 ][ GFBITS ] = {
#include "powers.inc"
    };

    uint64_t consts_ptr = 2;

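    /* bit-reversal permutation on 6-bit indices */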
    const unsigned char reversal[64] = {
        0, 32, 16, 48,  8, 40, 24, 56,
        4, 36, 20, 52, 12, 44, 28, 60,
        2, 34, 18, 50, 10, 42, 26, 58,
        6, 38, 22, 54, 14, 46, 30, 62,
        1, 33, 17, 49,  9, 41, 25, 57,
        5, 37, 21, 53, 13, 45, 29, 61,
        3, 35, 19, 51, 11, 43, 27, 59,
        7, 39, 23, 55, 15, 47, 31, 63
    };

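    /* bit-packed GF(2^13) elements used in the broadcast step below; the last entry is zero */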
    const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0};

    // broadcast

    for (j = 0; j < GFBITS; j++) {
        t[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(in[j], in[j]);
    }

    for (i = 0; i < 8; i += 2) {
        for (j = 0; j < GFBITS; j++) {
            v0 = (beta[i + 0] >> j) & 1;
            v0 = -v0;
            v1 = (beta[i + 1] >> j) & 1;
            v1 = -v1;

            tmp[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v0, v1);
        }

        PQCLEAN_MCELIECE6688128_AVX_vec128_mul(tmp, t, tmp);

        for (j = 0; j < GFBITS; j++) {
            pre.v[i + 0][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(tmp[j], tmp[j]);
            pre.v[i + 1][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(tmp[j], tmp[j]);
        }
    }

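    /* for each pair of bit slices, build buf.V[x] = buf.V[0] ^ (XOR of pre.V[k]
       over the set bits k of x) for x = 0..63, reusing earlier sums so that
       each new entry costs a single XOR */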
    for (i = 0; i < GFBITS; i += 2) {
        if (i != GFBITS - 1) {
            buf.v[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]);
        }
        buf.v[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]);

        buf.V[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]);
        buf.V[16] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]);
        buf.V[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]);
        buf.V[48] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]);
        buf.V[49] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]);
        buf.V[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]);
        buf.V[51] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]);
        buf.V[6] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]);
        buf.V[50] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]);
        buf.V[7] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]);
        buf.V[54] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]);
        buf.V[5] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]);
        buf.V[55] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]);
        buf.V[53] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]);
        buf.V[4] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]);
        buf.V[52] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]);
        buf.V[12] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]);
        buf.V[60] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]);
        buf.V[13] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]);
        buf.V[61] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]);
        buf.V[15] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]);
        buf.V[63] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]);
        buf.V[14] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]);
        buf.V[62] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]);
        buf.V[10] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]);
        buf.V[58] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]);
        buf.V[11] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]);
        buf.V[59] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]);
        buf.V[9] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]);
        buf.V[57] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]);
        buf.V[56] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]);
        buf.V[8] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]);
        buf.V[40] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]);
        buf.V[24] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]);
        buf.V[41] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]);
        buf.V[25] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]);
        buf.V[43] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]);
        buf.V[27] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]);
        buf.V[42] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]);
        buf.V[26] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]);
        buf.V[46] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]);
        buf.V[30] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]);
        buf.V[47] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]);
        buf.V[31] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]);
        buf.V[45] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]);
        buf.V[29] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]);
        buf.V[44] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]);
        buf.V[28] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]);
        buf.V[36] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]);
        buf.V[20] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]);
        buf.V[37] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]);
        buf.V[21] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]);
        buf.V[39] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]);
        buf.V[23] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]);
        buf.V[38] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]);
        buf.V[22] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]);
        buf.V[34] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]);
        buf.V[18] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]);
        buf.V[35] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]);
        buf.V[19] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]);
        buf.V[33] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]);
        buf.V[17] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]);
        buf.V[32] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]);

        // transpose

        PQCLEAN_MCELIECE6688128_AVX_transpose_64x256_sp(buf.V);

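        /* gather the transposed vectors into out[] in bit-reversed order */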
        for (j = 0; j < 32; j++) {
            if (i != GFBITS - 1) {
                out[j][i + 1] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]);
            }
            out[j][i + 0] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]);
        }
    }

    // butterflies

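    /* butterfly layer on adjacent pairs (out[k], out[k + 1]): interleave the
       lanes, multiply-and-accumulate with consts[1], then de-interleave */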
    for (k = 0; k < 32; k += 2) {
        for (b = 0; b < GFBITS; b++) {
            tmp0[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]);
        }
        for (b = 0; b < GFBITS; b++) {
            tmp1[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]);
        }

        PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]);

        for (b = 0; b < GFBITS; b++) {
            out[k][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (tmp0[b], tmp1[b]);
        }
        for (b = 0; b < GFBITS; b++) {
            out[k + 1][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high (tmp0[b], tmp1[b]);
        }
    }

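    /* remaining butterfly stages: stage i works on blocks of 2*s = 2^(i+1)
       vectors, pairing out[k] with out[k + s] */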
    for (i = 0; i <= 4; i++) {
        s = 1 << i;

        for (j = 0; j < 32; j += 2 * s) {
            for (k = j; k < j + s; k++) {
                PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]);
            }
        }

        consts_ptr += (1 << i);
    }

    // adding the part contributed by x^128

    for (i = 0; i < 32; i++) {
        for (b = 0; b < GFBITS; b++) {
            out[i][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(out[i][b], powers[i][b]);
        }
    }
}

/* input: in, polynomial in bitsliced form */
/* output: out, bitsliced results of evaluating in at all field elements */
void PQCLEAN_MCELIECE6688128_AVX_fft(vec256 out[][GFBITS], vec128 *in) {
    radix_conversions(in);
    butterflies(out, in);
}