1 /*
2   This file is for Benes network related functions
3 */
4 
5 #include "benes.h"
6 
7 #include "params.h"
8 #include "transpose.h"
9 #include "util.h"
10 
/* Middle ("x") layer of the Benes network: operates on pairs of adjacent
 * 128-bit words after a lane interleave.  For each pair, conditionally
 * swaps the interleaved halves under the control-bit mask, then restores
 * the original lane order.  Consumes 32 mask vectors from bits. */
static void layer_x(vec128 *data, vec128 *bits) {
    int i;
    vec128 lo, hi, m;

    for (i = 0; i < 64; i += 2) {
        lo = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(data[i], data[i + 1]);
        hi = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(data[i], data[i + 1]);

        /* constant-time conditional swap: m selects the differing bits */
        m = PQCLEAN_MCELIECE460896_AVX_vec128_and(
                PQCLEAN_MCELIECE460896_AVX_vec128_xor(lo, hi), bits[i / 2]);
        lo = PQCLEAN_MCELIECE460896_AVX_vec128_xor(lo, m);
        hi = PQCLEAN_MCELIECE460896_AVX_vec128_xor(hi, m);

        data[i]     = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(lo, hi);
        data[i + 1] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(lo, hi);
    }
}
29 
/* Benes layer with stride 1: conditionally swaps each pair of adjacent
 * bit-sliced words under the corresponding control-bit mask.
 * Consumes 32 mask vectors from cond. */
static void layer_0(vec128 *bs, vec128 *cond) {
    int i;
    vec128 m;

    for (i = 0; i < 64; i += 2) {
        /* constant-time swap of bs[i] and bs[i+1] where mask bits are set */
        m = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[i], bs[i + 1]);
        m = PQCLEAN_MCELIECE460896_AVX_vec128_and(m, cond[i / 2]);
        bs[i]     = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[i], m);
        bs[i + 1] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[i + 1], m);
    }
}
41 
/* Benes layer with stride 2: within each group of 4 words, conditionally
 * swaps elements j and j+2 under per-pair masks.  Consumes 32 mask
 * vectors from cond (2 per group). */
static void layer_1(vec128 *bs, vec128 *cond) {
    int x, j;
    vec128 m;

    for (x = 0; x < 64; x += 4) {
        for (j = 0; j < 2; j++) {
            m = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], bs[x + j + 2]);
            m = PQCLEAN_MCELIECE460896_AVX_vec128_and(m, cond[j]);
            bs[x + j]     = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], m);
            bs[x + j + 2] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j + 2], m);
        }
        cond += 2;
    }
}
60 
/* Benes layer with stride 4: within each group of 8 words, conditionally
 * swaps elements j and j+4 under per-pair masks.  Consumes 32 mask
 * vectors from cond (4 per group). */
static void layer_2(vec128 *bs, vec128 *cond) {
    int x, j;
    vec128 m;

    for (x = 0; x < 64; x += 8) {
        for (j = 0; j < 4; j++) {
            m = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], bs[x + j + 4]);
            m = PQCLEAN_MCELIECE460896_AVX_vec128_and(m, cond[j]);
            bs[x + j]     = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j], m);
            bs[x + j + 4] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[x + j + 4], m);
        }
        cond += 4;
    }
}
89 
/* Benes layer with stride 8: within each group of 16 words, conditionally
 * swaps elements s+j and s+j+8 under per-pair masks.  Consumes 32 mask
 * vectors from cond (4 per inner iteration). */
static void layer_3(vec128 *bs, vec128 *cond) {
    int x, s, j;
    vec128 m;

    for (x = 0; x < 64; x += 16) {
        for (s = x; s < x + 8; s += 4) {
            for (j = 0; j < 4; j++) {
                m = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[s + j], bs[s + j + 8]);
                m = PQCLEAN_MCELIECE460896_AVX_vec128_and(m, cond[j]);
                bs[s + j]     = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[s + j], m);
                bs[s + j + 8] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[s + j + 8], m);
            }
            cond += 4;
        }
    }
}
120 
/* Benes layer with stride 16: within each group of 32 words, conditionally
 * swaps elements s+j and s+j+16 under per-pair masks.  Consumes 32 mask
 * vectors from cond (4 per inner iteration). */
static void layer_4(vec128 *bs, vec128 *cond) {
    int x, s, j;
    vec128 m;

    for (x = 0; x < 64; x += 32) {
        for (s = x; s < x + 16; s += 4) {
            for (j = 0; j < 4; j++) {
                m = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[s + j], bs[s + j + 16]);
                m = PQCLEAN_MCELIECE460896_AVX_vec128_and(m, cond[j]);
                bs[s + j]      = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[s + j], m);
                bs[s + j + 16] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[s + j + 16], m);
            }
            cond += 4;
        }
    }
}
151 
/* Benes layer with stride 32: conditionally swaps elements s+j and s+j+32
 * across the two halves of the 64-word block under per-pair masks.
 * Consumes 32 mask vectors from cond (4 per inner iteration). */
static void layer_5(vec128 *bs, vec128 *cond) {
    int x, s, j;
    vec128 m;

    for (x = 0; x < 64; x += 64) {
        for (s = x; s < x + 32; s += 4) {
            for (j = 0; j < 4; j++) {
                m = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[s + j], bs[s + j + 32]);
                m = PQCLEAN_MCELIECE460896_AVX_vec128_and(m, cond[j]);
                bs[s + j]      = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[s + j], m);
                bs[s + j + 32] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[s + j + 32], m);
            }
            cond += 4;
        }
    }
}
182 
183 /* input: bits, control bits as array of bytes */
184 /* output: bits_int, control bits as array of 128-bit vectors */
/* Load 1024 bytes of control bits for two consecutive layers: the two
 * 512-byte halves are packed into the low/high 64-bit lanes of 64 vectors,
 * bit-transposed, and the interleaved result is split into dst0/dst1. */
static void load_layer_pair(vec128 dst0[32], vec128 dst1[32], const unsigned char *ptr) {
    int j;
    vec128 buf[64];

    for (j = 0; j < 64; j++) {
        buf[j] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(
                     PQCLEAN_MCELIECE460896_AVX_load8(ptr + 8 * j),
                     PQCLEAN_MCELIECE460896_AVX_load8(ptr + 8 * j + 512));
    }

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( buf );

    for (j = 0; j < 32; j++) {
        dst0[j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(buf[j], buf[j + 32]);
        dst1[j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(buf[j], buf[j + 32]);
    }
}

/* input: bits, control bits as array of bytes */
/* output: bits_int, control bits as array of 128-bit vectors */
void PQCLEAN_MCELIECE460896_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) {
    int i, j;
    const unsigned char *ptr = bits;

    /* layers 0-5: need the bit-transposed form */
    for (i = 0; i <= 5; i += 2) {
        load_layer_pair(bits_int[i], bits_int[i + 1], ptr);
        ptr += 1024;
    }

    /* layers 6-18: loaded directly, 32 vectors of 16 bytes each */
    for (i = 6; i <= 18; i++) {
        for (j = 0; j < 32; j++) {
            bits_int[i][j] = PQCLEAN_MCELIECE460896_AVX_load16(ptr);
            ptr += 16;
        }
    }

    /* layers 19-24: bit-transposed form again */
    for (i = 19; i < 25; i += 2) {
        load_layer_pair(bits_int[i], bits_int[i + 1], ptr);
        ptr += 1024;
    }
}
230 
231 /* input: r, sequence of bits to be permuted */
232 /*        b, control bits as array of 128-bit vectors  */
233 /*        rev, 0 for normal application; !0 for inverse */
234 /* output: r, permuted bits */
/* input: r, sequence of bits to be permuted */
/*        b, control bits as array of 128-bit vectors  */
/*        rev, 0 for normal application; !0 for inverse */
/* output: r, permuted bits */
void PQCLEAN_MCELIECE460896_AVX_benes(vec128 *r, vec128 b[][32], int rev) {
    /* dispatch table: indices 0..5 select layer_0..layer_5, 6 selects layer_x */
    static void (*const apply[7])(vec128 *, vec128 *) = {
        layer_0, layer_1, layer_2, layer_3, layer_4, layer_5, layer_x
    };
    /* layer schedules between the transpositions (25 layers in total) */
    static const unsigned char stage1[6]  = {0, 1, 2, 3, 4, 5};
    static const unsigned char stage2[13] = {6, 0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0, 6};
    static const unsigned char stage3[6]  = {5, 4, 3, 2, 1, 0};

    int i, inc;
    vec128 *cond;

    /* inverse application walks the 25 control-bit chunks backwards */
    if (rev == 0) {
        inc  = 32;
        cond = b[ 0];
    } else {
        inc  = -32;
        cond = b[24];
    }

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r );

    for (i = 0; i < 6; i++) {
        apply[stage1[i]](r, cond);
        cond += inc;
    }

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r );

    for (i = 0; i < 13; i++) {
        apply[stage2[i]](r, cond);
        cond += inc;
    }

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r );

    for (i = 0; i < 6; i++) {
        apply[stage3[i]](r, cond);
        cond += inc;
    }

    PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r );
}
311 
312