/*
  This file is for Benes network related functions
*/

#include "benes.h"

#include "params.h"
#include "transpose.h"
#include "util.h"

/* Middle layer of the Benes network applied in bitsliced form.
 * For every adjacent pair of rows: interleave the 64-bit halves,
 * conditionally exchange them under the control bits, de-interleave.
 * Masked XOR swap only — no secret-dependent branches.
 * Consumes 32 vec128 words of control bits. */
static void layer_x(vec128 *data, vec128 *bits) {
    int i;
    vec128 lo, hi, mask;

    for (i = 0; i < 64; i += 2) {
        lo = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(data[i], data[i + 1]);
        hi = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(data[i], data[i + 1]);

        mask = PQCLEAN_MCELIECE460896_AVX_vec128_xor(lo, hi);
        mask = PQCLEAN_MCELIECE460896_AVX_vec128_and(mask, *bits++);
        lo = PQCLEAN_MCELIECE460896_AVX_vec128_xor(lo, mask);
        hi = PQCLEAN_MCELIECE460896_AVX_vec128_xor(hi, mask);

        data[i]     = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(lo, hi);
        data[i + 1] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(lo, hi);
    }
}
/* Distance-1 conditional-swap layer: rows 2k and 2k+1 are exchanged
 * wherever the corresponding condition bits are set.  Branch-free
 * masked XOR swap; consumes 32 vec128 words of condition bits. */
static void layer_0(vec128 *bs, vec128 *cond) {
    int i;
    vec128 d;

    for (i = 0; i < 64; i += 2) {
        d = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[i], bs[i + 1]);
        d = PQCLEAN_MCELIECE460896_AVX_vec128_and(d, cond[i / 2]);
        bs[i]     = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[i], d);
        bs[i + 1] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[i + 1], d);
    }
}
/* Distance-2 conditional-swap layer: within each block of 4 rows,
 * row j is conditionally exchanged with row j+2 (j = 0, 1).
 * Consumes 32 vec128 words of condition bits (2 per block). */
static void layer_1(vec128 *bs, vec128 *cond) {
    int x, j;
    vec128 d;

    for (x = 0; x < 64; x += 4) {
        for (j = 0; j < 2; j++) {
            d = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], bs[x + j + 2]);
            d = PQCLEAN_MCELIECE460896_AVX_vec128_and(d, cond[j]);
            bs[x + j]     = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], d);
            bs[x + j + 2] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j + 2], d);
        }
        cond += 2;
    }
}
/* Distance-4 conditional-swap layer: within each block of 8 rows,
 * row j is conditionally exchanged with row j+4 (j = 0..3).
 * Consumes 32 vec128 words of condition bits (4 per block). */
static void layer_2(vec128 *bs, vec128 *cond) {
    int x, j;
    vec128 d;

    for (x = 0; x < 64; x += 8) {
        for (j = 0; j < 4; j++) {
            d = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], bs[x + j + 4]);
            d = PQCLEAN_MCELIECE460896_AVX_vec128_and(d, cond[j]);
            bs[x + j]     = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], d);
            bs[x + j + 4] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j + 4], d);
        }
        cond += 4;
    }
}
/* Distance-8 conditional-swap layer: within each block of 16 rows,
 * row j is conditionally exchanged with row j+8 (j = 0..7).
 * Consumes 32 vec128 words of condition bits (8 per block); the
 * condition words are used in the same order as the row pairs. */
static void layer_3(vec128 *bs, vec128 *cond) {
    int x, j;
    vec128 d;

    for (x = 0; x < 64; x += 16) {
        for (j = 0; j < 8; j++) {
            d = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], bs[x + j + 8]);
            d = PQCLEAN_MCELIECE460896_AVX_vec128_and(d, cond[j]);
            bs[x + j]     = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], d);
            bs[x + j + 8] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j + 8], d);
        }
        cond += 8;
    }
}
/* Distance-16 conditional-swap layer: within each block of 32 rows,
 * row j is conditionally exchanged with row j+16 (j = 0..15).
 * Consumes 32 vec128 words of condition bits (16 per block). */
static void layer_4(vec128 *bs, vec128 *cond) {
    int x, j;
    vec128 d;

    for (x = 0; x < 64; x += 32) {
        for (j = 0; j < 16; j++) {
            d = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], bs[x + j + 16]);
            d = PQCLEAN_MCELIECE460896_AVX_vec128_and(d, cond[j]);
            bs[x + j]      = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], d);
            bs[x + j + 16] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j + 16], d);
        }
        cond += 16;
    }
}
/* Distance-32 conditional-swap layer: row j is conditionally exchanged
 * with row j+32 (j = 0..31).  The original outer block loop covers the
 * whole 64-row array in a single pass, so only the inner loop remains.
 * Consumes 32 vec128 words of condition bits. */
static void layer_5(vec128 *bs, vec128 *cond) {
    int j;
    vec128 d;

    for (j = 0; j < 32; j++) {
        d = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[j], bs[j + 32]);
        d = PQCLEAN_MCELIECE460896_AVX_vec128_and(d, cond[j]);
        bs[j]      = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[j], d);
        bs[j + 32] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[j + 32], d);
    }
}

/* input: bits, control bits as array of bytes */
/* output: bits_int, control bits as array of 128-bit vectors */
/* Load 1024 bytes of control bits covering layers i and i+1: read 64
 * pairs of 64-bit words (low half from ptr, high half from ptr + 512),
 * transpose the resulting 64x128 bit matrix, then interleave columns
 * into bits_int[i] and bits_int[i+1].  Returns ptr advanced past both
 * 512-byte halves. */
static const unsigned char *load_bits_pair(vec128 bits_int[][32], int i, const unsigned char *ptr) {
    int j;
    vec128 buf[64];

    for (j = 0; j < 64; j++) {
        buf[j] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(PQCLEAN_MCELIECE460896_AVX_load8(ptr), PQCLEAN_MCELIECE460896_AVX_load8(ptr + 512));
        ptr += 8;
    }

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( buf );

    for (j = 0; j < 32; j++) {
        bits_int[i + 0][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(buf[j], buf[j + 32]);
        bits_int[i + 1][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(buf[j], buf[j + 32]);
    }

    return ptr + 512;  /* skip the high-half bytes consumed via ptr + 512 */
}

/* input: bits, control bits as array of bytes */
/* output: bits_int, control bits as array of 128-bit vectors
 *
 * Layers 0..5 and 19..24 need the transpose/interleave treatment
 * (they are applied to bit-transposed data); layers 6..18 are loaded
 * directly as 32 vec128 words each. */
void PQCLEAN_MCELIECE460896_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) {
    int i, j;
    const unsigned char *ptr = bits;

    for (i = 0; i <= 5; i += 2) {
        ptr = load_bits_pair(bits_int, i, ptr);
    }

    for (i = 6; i <= 18; i++) {
        for (j = 0; j < 32; j++) {
            bits_int[i][j] = PQCLEAN_MCELIECE460896_AVX_load16(ptr);
            ptr += 16;
        }
    }

    for (i = 19; i < 25; i += 2) {
        ptr = load_bits_pair(bits_int, i, ptr);
    }
}

/* input: r, sequence of bits to be permuted */
/*        b, control bits as array of 128-bit vectors */
/*        rev, 0 for normal application; !0 for inverse */
/* output: r, permuted bits */
/* Apply the Benes network (or its inverse, if rev != 0) to r.
 *
 * b holds 25 chunks of 32 vec128 condition words each (b[0]..b[24]);
 * every layer application consumes exactly one chunk.  The forward
 * direction walks the chunks front to back, the inverse back to front.
 * The layer schedule, interleaved with bit-matrix transposes, is:
 *   0 1 2 3 4 5 | x 0 1 2 3 4 5 4 3 2 1 0 x | 5 4 3 2 1 0
 * (6 + 13 + 6 = 25 chunks). */
void PQCLEAN_MCELIECE460896_AVX_benes(vec128 *r, vec128 b[][32], int rev) {
    /* Non-middle layers indexed by their distance exponent. */
    static void (*const layer[6])(vec128 *, vec128 *) = {
        layer_0, layer_1, layer_2, layer_3, layer_4, layer_5
    };
    vec128 *cond;
    int inc, i;

    if (rev) {
        inc = -32;
        cond = b[24];
    } else {
        inc = 32;
        cond = b[ 0];
    }

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r );

    for (i = 0; i <= 5; i++) {
        layer[i](r, cond);
        cond += inc;
    }

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r );

    layer_x(r, cond);
    cond += inc;
    for (i = 0; i <= 5; i++) {
        layer[i](r, cond);
        cond += inc;
    }
    for (i = 4; i >= 0; i--) {
        layer[i](r, cond);
        cond += inc;
    }
    layer_x(r, cond);
    cond += inc;

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r );

    for (i = 5; i >= 1; i--) {
        layer[i](r, cond);
        cond += inc;
    }
    /* Last chunk; no advance, so cond never steps before b[0] in
     * reverse mode (matches the original's commented-out increment). */
    layer_0(r, cond);

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r );
}