/*
  This file implements the transpose of the Gao-Mateer FFT.
  Functions whose names end in _tr are (roughly) the transpose of the corresponding functions in fft.c.
*/

#include "fft_tr.h"

#include "transpose.h"

#include <stdint.h>

static void radix_conversions_tr(vec256 *in) {
    int i, j, k;
    vec256 t;
    uint64_t v[4];

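    // Bit masks used by the shift-and-XOR steps of the transposed radix conversions.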
    const vec256 mask[6][2] = {
        {
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222),
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444)
        },
        {
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C),
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030)
        },
        {
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0),
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00)
        },
        {
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00),
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000)
        },
        {
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000),
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000)
        },
        {
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
            PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF)
        }
    };

    const vec256 s[6][GFBITS] = {
#include "scalars_4x.inc"
    };

    //

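    // Transposed radix conversions, processed in reverse level order: scale by
    // s[j] (skipped for the outermost iteration, j == 6), then apply the
    // transposed shift-and-XOR steps for that level.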
    for (j = 6; j >= 0; j--) {
        if (j < 6) {
            PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(in, in, s[j]); // scaling
        }

        for (k = j; k <= 4; k++) {
            for (i = 0; i < GFBITS; i++) {
                t = PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[i], mask[k][0]);
                t = PQCLEAN_MCELIECE8192128F_AVX_vec256_sll_4x(t, 1 << k);
                in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(in[i], t);

                t = PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[i], mask[k][1]);
                t = PQCLEAN_MCELIECE8192128F_AVX_vec256_sll_4x(t, 1 << k);
                in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(in[i], t);
            }
        }

        if (j <= 5) {
            for (i = 0; i < GFBITS; i++) {
                v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 0);
                v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 1);
                v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 2);
                v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 3);

                v[1] ^= v[0] >> 32;
                v[1] ^= v[1] << 32;
                v[3] ^= v[2] >> 32;
                v[3] ^= v[3] << 32;

                in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]);
            }
        }

        for (i = 0; i < GFBITS; i++) {
            v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 0);
            v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 1);
            v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 2);
            v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 3);

            v[2] ^= v[1];
            v[3] ^= v[2];

            in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]);
        }

    }
}

static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) {
    int i, j, k, s, b;

    vec256 t0[ GFBITS ];
    vec256 t1[ GFBITS ];
    vec256 t;

    vec128 out128[ GFBITS ][ 2 ];
    vec128 tmp[ GFBITS ];

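    // Two views of the same storage: pre holds the six accumulators used by the
    // broadcast stage below, buf is the 64 x 256-bit block that gets transposed.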
    union {
        vec128 v[6][  GFBITS + 1    ];
        vec256 V[6][ (GFBITS + 1) / 2 ];
    } pre;

    union {
        vec128 v[64][ 2 ];
        vec256 V[64];
    } buf;

    const vec256 consts[ 33 ][ GFBITS ] = {
#include "consts.inc"
    };

    uint64_t v[4];
    uint64_t consts_ptr = 33;

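    // Bit-reversal permutation of the indices 0..63, used to reorder rows before
    // the 64x256 transpose.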
    const unsigned char reversal[] = {
        0, 32, 16, 48,  8, 40, 24, 56,
        4, 36, 20, 52, 12, 44, 28, 60,
        2, 34, 18, 50, 10, 42, 26, 58,
        6, 38, 22, 54, 14, 46, 30, 62,
        1, 33, 17, 49,  9, 41, 25, 57,
        5, 37, 21, 53, 13, 45, 29, 61,
        3, 35, 19, 51, 11, 43, 27, 59,
        7, 39, 23, 55, 15, 47, 31, 63
    };

    const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755};

    // butterflies

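    // Main butterfly layers, consuming the constant table from the end: at each
    // level i (stride s = 1 << i), blocks k and k + s are combined in place by
    // vec256_ama_asm with that level's constants.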
    for (i = 4; i >= 0; i--) {
        s = 1 << i;
        consts_ptr -= s;

        for (j = 0; j < 32; j += 2 * s) {
            for (k = j; k < j + s; k++) {
                PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]);
            }
        }

    }

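    // Remaining butterfly layers inside each 256-bit vector: interleave adjacent
    // blocks with the unpack primitives, apply vec256_ama_asm with consts[1] and
    // then consts[0], and interleave back.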
    for (k = 0; k < 32; k += 2) {
        for (b = 0; b < GFBITS; b++) {
            t0[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]);
        }
        for (b = 0; b < GFBITS; b++) {
            t1[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]);
        }

        PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(t0, t1, consts[1]);

        for (b = 0; b < GFBITS; b++) {
            in[k][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low(t0[b], t1[b]);
        }
        for (b = 0; b < GFBITS; b++) {
            in[k + 1][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high(t0[b], t1[b]);
        }

        for (b = 0; b < GFBITS; b++) {
            t0[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]);
        }
        for (b = 0; b < GFBITS; b++) {
            t1[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]);
        }

        PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(t0, t1, consts[0]);

        for (b = 0; b < GFBITS; b++) {
            in[k + 0][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low_2x(t0[b], t1[b]);
        }
        for (b = 0; b < GFBITS; b++) {
            in[k + 1][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high_2x(t0[b], t1[b]);
        }
    }


    // broadcast

    for (i = 0; i < GFBITS; i += 2) {
        // transpose

        for (k = 0; k < 32; k++) {
            if (i != GFBITS - 1) {
                buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 1], 0);
                buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 1], 1);
            }

            buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 0], 0);
            buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 0], 1);
        }

        PQCLEAN_MCELIECE8192128F_AVX_transpose_64x256_sp(buf.V);

        //

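        // Fold the 64 transposed rows into the six accumulators pre.V[0..5]:
        // each buf.V[] entry is XORed into the next row of the traversal and the
        // running values are collected into the accumulators.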
        pre.V[0][i / 2] = buf.V[32];
        buf.V[33] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[33], buf.V[32]);
        pre.V[1][i / 2] = buf.V[33];
        buf.V[35] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[35], buf.V[33]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]);
        buf.V[34] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[34], buf.V[35]);
        pre.V[2][i / 2] = buf.V[34];
        buf.V[38] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[38], buf.V[34]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]);
        buf.V[39] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[39], buf.V[38]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]);
        buf.V[37] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[37], buf.V[39]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]);
        buf.V[36] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[36], buf.V[37]);
        pre.V[3][i / 2] = buf.V[36];
        buf.V[44] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[44], buf.V[36]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]);
        buf.V[45] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[45], buf.V[44]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]);
        buf.V[47] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[47], buf.V[45]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]);
        buf.V[46] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[46], buf.V[47]);
        pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]);
        buf.V[42] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[42], buf.V[46]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]);
        buf.V[43] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[43], buf.V[42]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]);
        buf.V[41] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[41], buf.V[43]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]);
        buf.V[40] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[40], buf.V[41]);
        pre.V[4][i / 2] = buf.V[40];
        buf.V[56] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[56], buf.V[40]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]);
        buf.V[57] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[57], buf.V[56]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]);
        buf.V[59] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[59], buf.V[57]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]);
        buf.V[58] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[58], buf.V[59]);
        pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]);
        buf.V[62] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[62], buf.V[58]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]);
        buf.V[63] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[63], buf.V[62]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]);
        buf.V[61] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[61], buf.V[63]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]);
        buf.V[60] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[60], buf.V[61]);
        pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]);
        buf.V[52] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[52], buf.V[60]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]);
        buf.V[53] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[53], buf.V[52]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]);
        buf.V[55] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[55], buf.V[53]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]);
        buf.V[54] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[54], buf.V[55]);
        pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]);
        buf.V[50] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[50], buf.V[54]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]);
        buf.V[51] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[51], buf.V[50]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]);
        buf.V[49] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[49], buf.V[51]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]);
        buf.V[48] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[48], buf.V[49]);
        pre.V[5][i / 2] = buf.V[48];
        buf.V[16] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[16], buf.V[48]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]);
        buf.V[17] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[17], buf.V[16]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]);
        buf.V[19] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[19], buf.V[17]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]);
        buf.V[18] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[18], buf.V[19]);
        pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]);
        buf.V[22] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[22], buf.V[18]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]);
        buf.V[23] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[23], buf.V[22]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]);
        buf.V[21] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[21], buf.V[23]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]);
        buf.V[20] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[20], buf.V[21]);
        pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]);
        buf.V[28] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[28], buf.V[20]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]);
        buf.V[29] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[29], buf.V[28]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]);
        buf.V[31] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[31], buf.V[29]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]);
        buf.V[30] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[30], buf.V[31]);
        pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]);
        buf.V[26] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[26], buf.V[30]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]);
        buf.V[27] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[27], buf.V[26]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]);
        buf.V[25] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[25], buf.V[27]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]);
        buf.V[24] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[24], buf.V[25]);
        pre.V[4][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]);
        buf.V[8] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[8], buf.V[24]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]);
        buf.V[9] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[9], buf.V[8]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]);
        buf.V[11] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[11], buf.V[9]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]);
        buf.V[10] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[10], buf.V[11]);
        pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]);
        buf.V[14] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[14], buf.V[10]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]);
        buf.V[15] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[15], buf.V[14]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]);
        buf.V[13] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[13], buf.V[15]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]);
        buf.V[12] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[12], buf.V[13]);
        pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]);
        buf.V[4] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[4], buf.V[12]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]);
        buf.V[5] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[5], buf.V[4]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]);
        buf.V[7] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[7], buf.V[5]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]);
        buf.V[6] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[6], buf.V[7]);
        pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]);
        buf.V[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[2], buf.V[6]);
        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]);
        buf.V[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[3], buf.V[2]);
        pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]);
        buf.V[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[1], buf.V[3]);

        pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]);
        t = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[0], buf.V[1]);

        if (i != GFBITS - 1) {
            out128[i + 1][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(t, 1);
        }
        out128[i + 0][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(t, 0);

    }

    //

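    // Second 128-bit halves of the outputs: accumulate beta[i] * pre[i] over the
    // six accumulators, expanding each constant beta[i] into bitsliced form with
    // vec128_setbits before the field multiplication.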
    for (j = 0; j < GFBITS; j++) {
        tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((beta[0] >> j) & 1);
    }

    PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, pre.v[0], tmp);

    for (b = 0; b < GFBITS; b++) {
        out128[b][1] = tmp[b];
    }

    for (i = 1; i < 6; i++) {
        for (j = 0; j < GFBITS; j++) {
            tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((beta[i] >> j) & 1);
        }

        PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, pre.v[i], tmp);

        for (b = 0; b < GFBITS; b++) {
            out128[b][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(out128[b][1], tmp[b]);
        }
    }

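    // Pack the two 128-bit halves of each output coordinate into one vec256.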
    for (b = 0; b < GFBITS; b++) {
        v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][0], 0);
        v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][0], 1);
        v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][1], 0);
        v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][1], 1);

        out[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]);
    }
}

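// Transposed FFT: transposed butterflies followed by transposed radix
// conversions, i.e. the stages of the forward FFT in reverse order.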
void PQCLEAN_MCELIECE8192128F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) {
    butterflies_tr(out, in);
    radix_conversions_tr(out);
}