/*
  This file implements the transpose of the Gao-Mateer FFT.
  Functions with names ending in _tr are (roughly) the transposes of the corresponding functions in fft.c.
*/
5
6 #include "fft_tr.h"
7
8 #include "transpose.h"
9
10 #include <stdint.h>
11
radix_conversions_tr(vec256 * in)12 static void radix_conversions_tr(vec256 *in) {
13 int i, j, k;
14 vec256 t;
15 uint64_t v[4];
16
17 const vec256 mask[6][2] = {
18 {
19 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222),
20 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444)
21 },
22 {
23 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C),
24 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030)
25 },
26 {
27 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0),
28 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00)
29 },
30 {
31 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00),
32 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000)
33 },
34 {
35 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000),
36 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000)
37 },
38 {
39 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000),
40 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF)
41 }
42 };
43
44 const vec256 s[6][GFBITS] = {
45 #include "scalars_4x.inc"
46 };
47
48 //
49
50 for (j = 6; j >= 0; j--) {
51 if (j < 6) {
52 PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(in, in, s[j]); // scaling
53 }
54
55 for (k = j; k <= 4; k++) {
56 for (i = 0; i < GFBITS; i++) {
57 t = PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[i], mask[k][0]);
58 t = PQCLEAN_MCELIECE8192128F_AVX_vec256_sll_4x(t, 1 << k);
59 in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(in[i], t);
60
61 t = PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[i], mask[k][1]);
62 t = PQCLEAN_MCELIECE8192128F_AVX_vec256_sll_4x(t, 1 << k);
63 in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(in[i], t);
64 }
65 }
66
67 if (j <= 5) {
68 for (i = 0; i < GFBITS; i++) {
69 v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 0);
70 v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 1);
71 v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 2);
72 v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 3);
73
74 v[1] ^= v[0] >> 32;
75 v[1] ^= v[1] << 32;
76 v[3] ^= v[2] >> 32;
77 v[3] ^= v[3] << 32;
78
79 in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]);
80 }
81 }
82
83 for (i = 0; i < GFBITS; i++) {
84 v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 0);
85 v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 1);
86 v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 2);
87 v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 3);
88
89 v[2] ^= v[1];
90 v[3] ^= v[2];
91
92 in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]);
93 }
94
95 }
96 }
97
butterflies_tr(vec256 * out,vec256 in[][GFBITS])98 static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) {
99 int i, j, k, s, b;
100
101 vec256 t0[ GFBITS ];
102 vec256 t1[ GFBITS ];
103 vec256 t;
104
105 vec128 out128[ GFBITS ][ 2 ];
106 vec128 tmp[ GFBITS ];
107
108 union {
109 vec128 v[6][ GFBITS + 1 ];
110 vec256 V[6][ (GFBITS + 1) / 2 ];
111 } pre;
112
113 union {
114 vec128 v[64][ 2 ];
115 vec256 V[64];
116 } buf;
117
118 const vec256 consts[ 33 ][ GFBITS ] = {
119 #include "consts.inc"
120 };
121
122 uint64_t v[4];
123 uint64_t consts_ptr = 33;
124
125 const unsigned char reversal[] = {
126 0, 32, 16, 48, 8, 40, 24, 56,
127 4, 36, 20, 52, 12, 44, 28, 60,
128 2, 34, 18, 50, 10, 42, 26, 58,
129 6, 38, 22, 54, 14, 46, 30, 62,
130 1, 33, 17, 49, 9, 41, 25, 57,
131 5, 37, 21, 53, 13, 45, 29, 61,
132 3, 35, 19, 51, 11, 43, 27, 59,
133 7, 39, 23, 55, 15, 47, 31, 63
134 };
135
136 const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755};
137
138 // butterflies
139
140 for (i = 4; i >= 0; i--) {
141 s = 1 << i;
142 consts_ptr -= s;
143
144 for (j = 0; j < 32; j += 2 * s) {
145 for (k = j; k < j + s; k++) {
146 PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]);
147 }
148 }
149
150 }
151
152 for (k = 0; k < 32; k += 2) {
153 for (b = 0; b < GFBITS; b++) {
154 t0[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]);
155 }
156 for (b = 0; b < GFBITS; b++) {
157 t1[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]);
158 }
159
160 PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(t0, t1, consts[1]);
161
162 for (b = 0; b < GFBITS; b++) {
163 in[k][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low(t0[b], t1[b]);
164 }
165 for (b = 0; b < GFBITS; b++) {
166 in[k + 1][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high(t0[b], t1[b]);
167 }
168
169 for (b = 0; b < GFBITS; b++) {
170 t0[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]);
171 }
172 for (b = 0; b < GFBITS; b++) {
173 t1[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]);
174 }
175
176 PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(t0, t1, consts[0]);
177
178 for (b = 0; b < GFBITS; b++) {
179 in[k + 0][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low_2x(t0[b], t1[b]);
180 }
181 for (b = 0; b < GFBITS; b++) {
182 in[k + 1][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high_2x(t0[b], t1[b]);
183 }
184 }
185
186
187 // boradcast
188
189 for (i = 0; i < GFBITS; i += 2) {
190 // transpose
191
192 for (k = 0; k < 32; k++) {
193 if (i != GFBITS - 1) {
194 buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 1], 0);
195 buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 1], 1);
196 }
197
198 buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 0], 0);
199 buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 0], 1);
200 }
201
202 PQCLEAN_MCELIECE8192128F_AVX_transpose_64x256_sp(buf.V);
203
204 //
205
206 pre.V[0][i / 2] = buf.V[32];
207 buf.V[33] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[33], buf.V[32]);
208 pre.V[1][i / 2] = buf.V[33];
209 buf.V[35] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[35], buf.V[33]);
210 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]);
211 buf.V[34] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[34], buf.V[35]);
212 pre.V[2][i / 2] = buf.V[34];
213 buf.V[38] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[38], buf.V[34]);
214 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]);
215 buf.V[39] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[39], buf.V[38]);
216 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]);
217 buf.V[37] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[37], buf.V[39]);
218 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]);
219 buf.V[36] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[36], buf.V[37]);
220 pre.V[3][i / 2] = buf.V[36];
221 buf.V[44] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[44], buf.V[36]);
222 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]);
223 buf.V[45] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[45], buf.V[44]);
224 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]);
225 buf.V[47] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[47], buf.V[45]);
226 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]);
227 buf.V[46] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[46], buf.V[47]);
228 pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]);
229 buf.V[42] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[42], buf.V[46]);
230 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]);
231 buf.V[43] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[43], buf.V[42]);
232 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]);
233 buf.V[41] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[41], buf.V[43]);
234 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]);
235 buf.V[40] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[40], buf.V[41]);
236 pre.V[4][i / 2] = buf.V[40];
237 buf.V[56] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[56], buf.V[40]);
238 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]);
239 buf.V[57] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[57], buf.V[56]);
240 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]);
241 buf.V[59] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[59], buf.V[57]);
242 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]);
243 buf.V[58] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[58], buf.V[59]);
244 pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]);
245 buf.V[62] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[62], buf.V[58]);
246 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]);
247 buf.V[63] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[63], buf.V[62]);
248 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]);
249 buf.V[61] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[61], buf.V[63]);
250 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]);
251 buf.V[60] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[60], buf.V[61]);
252 pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]);
253 buf.V[52] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[52], buf.V[60]);
254 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]);
255 buf.V[53] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[53], buf.V[52]);
256 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]);
257 buf.V[55] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[55], buf.V[53]);
258 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]);
259 buf.V[54] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[54], buf.V[55]);
260 pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]);
261 buf.V[50] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[50], buf.V[54]);
262 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]);
263 buf.V[51] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[51], buf.V[50]);
264 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]);
265 buf.V[49] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[49], buf.V[51]);
266 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]);
267 buf.V[48] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[48], buf.V[49]);
268 pre.V[5][i / 2] = buf.V[48];
269 buf.V[16] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[16], buf.V[48]);
270 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]);
271 buf.V[17] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[17], buf.V[16]);
272 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]);
273 buf.V[19] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[19], buf.V[17]);
274 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]);
275 buf.V[18] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[18], buf.V[19]);
276 pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]);
277 buf.V[22] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[22], buf.V[18]);
278 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]);
279 buf.V[23] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[23], buf.V[22]);
280 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]);
281 buf.V[21] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[21], buf.V[23]);
282 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]);
283 buf.V[20] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[20], buf.V[21]);
284 pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]);
285 buf.V[28] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[28], buf.V[20]);
286 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]);
287 buf.V[29] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[29], buf.V[28]);
288 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]);
289 buf.V[31] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[31], buf.V[29]);
290 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]);
291 buf.V[30] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[30], buf.V[31]);
292 pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]);
293 buf.V[26] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[26], buf.V[30]);
294 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]);
295 buf.V[27] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[27], buf.V[26]);
296 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]);
297 buf.V[25] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[25], buf.V[27]);
298 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]);
299 buf.V[24] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[24], buf.V[25]);
300 pre.V[4][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]);
301 buf.V[8] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[8], buf.V[24]);
302 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]);
303 buf.V[9] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[9], buf.V[8]);
304 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]);
305 buf.V[11] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[11], buf.V[9]);
306 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]);
307 buf.V[10] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[10], buf.V[11]);
308 pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]);
309 buf.V[14] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[14], buf.V[10]);
310 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]);
311 buf.V[15] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[15], buf.V[14]);
312 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]);
313 buf.V[13] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[13], buf.V[15]);
314 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]);
315 buf.V[12] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[12], buf.V[13]);
316 pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]);
317 buf.V[4] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[4], buf.V[12]);
318 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]);
319 buf.V[5] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[5], buf.V[4]);
320 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]);
321 buf.V[7] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[7], buf.V[5]);
322 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]);
323 buf.V[6] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[6], buf.V[7]);
324 pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]);
325 buf.V[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[2], buf.V[6]);
326 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]);
327 buf.V[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[3], buf.V[2]);
328 pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]);
329 buf.V[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[1], buf.V[3]);
330
331 pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]);
332 t = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[0], buf.V[1]);
333
334 if (i != GFBITS - 1) {
335 out128[i + 1][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(t, 1);
336 }
337 out128[i + 0][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(t, 0);
338
339 }
340
341 //
342
343 for (j = 0; j < GFBITS; j++) {
344 tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((beta[0] >> j) & 1);
345 }
346
347 PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, pre.v[0], tmp);
348
349 for (b = 0; b < GFBITS; b++) {
350 out128[b][1] = tmp[b];
351 }
352
353 for (i = 1; i < 6; i++) {
354 for (j = 0; j < GFBITS; j++) {
355 tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((beta[i] >> j) & 1);
356 }
357
358 PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, pre.v[i], tmp);
359
360 for (b = 0; b < GFBITS; b++) {
361 out128[b][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(out128[b][1], tmp[b]);
362 }
363 }
364
365 for (b = 0; b < GFBITS; b++) {
366 v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][0], 0);
367 v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][0], 1);
368 v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][1], 0);
369 v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][1], 1);
370
371 out[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]);
372 }
373 }
374
/*
 * Transposed FFT entry point.
 *
 * out : GFBITS vectors, written by the butterfly step and then converted in place.
 * in  : rows of GFBITS vectors; the butterfly step modifies them.
 */
void PQCLEAN_MCELIECE8192128F_AVX_fft_tr(vec256 *out, vec256 in[][GFBITS]) {
    /* Same two steps as always: butterflies first, radix conversions second. */
    butterflies_tr(out, in);
    radix_conversions_tr(out);
}
379
380