1 /*
2   This file is for the inversion-free Berlekamp-Massey algorithm
3   see https://ieeexplore.ieee.org/document/87857
4 */
5 
6 #include "bm.h"
7 
8 #include "gf.h"
9 #include "util.h"
10 #include "vec.h"
11 #include "vec128.h"
12 
13 #include <stdint.h>
14 
15 extern void PQCLEAN_MCELIECE348864_AVX_update_asm(void *, gf, int);
16 extern gf PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(uint64_t *);
17 
mask_nonzero(gf a)18 static inline uint64_t mask_nonzero(gf a) {
19     uint64_t ret = a;
20 
21     ret -= 1;
22     ret >>= 63;
23     ret -= 1;
24 
25     return ret;
26 }
27 
mask_leq(uint16_t a,uint16_t b)28 static inline uint64_t mask_leq(uint16_t a, uint16_t b) {
29     uint64_t a_tmp = a;
30     uint64_t b_tmp = b;
31     uint64_t ret = b_tmp - a_tmp;
32 
33     ret >>= 63;
34     ret -= 1;
35 
36     return ret;
37 }
38 
vec_cmov(uint64_t out[][2],uint64_t mask)39 static inline void vec_cmov(uint64_t out[][2], uint64_t mask) {
40     int i;
41 
42     for (i = 0; i < GFBITS; i++) {
43         out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask);
44     }
45 }
46 
interleave(vec128 * in,int idx0,int idx1,vec128 * mask,int b)47 static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) {
48     int s = 1 << b;
49 
50     vec128 x, y;
51 
52     x = PQCLEAN_MCELIECE348864_AVX_vec128_or(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx0], mask[0]),
53             PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx1], mask[0]), s));
54 
55     y = PQCLEAN_MCELIECE348864_AVX_vec128_or(PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx0], mask[1]), s),
56             PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx1], mask[1]));
57 
58     in[idx0] = x;
59     in[idx1] = y;
60 }
61 
62 /* input: in, field elements in bitsliced form */
63 /* output: out, field elements in non-bitsliced form */
get_coefs(gf * out,vec128 * in)64 static inline void get_coefs(gf *out, vec128 *in) {
65     int i, k;
66 
67     vec128 mask[4][2];
68     vec128 buf[16];
69 
70     for (i =  0; i < GFBITS; i++) {
71         buf[i] = in[i];
72     }
73     for (i = GFBITS; i < 16; i++) {
74         buf[i] = PQCLEAN_MCELIECE348864_AVX_vec128_setzero();
75     }
76 
77     mask[0][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x5555);
78     mask[0][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xAAAA);
79     mask[1][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x3333);
80     mask[1][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xCCCC);
81     mask[2][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x0F0F);
82     mask[2][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xF0F0);
83     mask[3][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x00FF);
84     mask[3][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xFF00);
85 
86     interleave(buf,  0,  8, mask[3], 3);
87     interleave(buf,  1,  9, mask[3], 3);
88     interleave(buf,  2, 10, mask[3], 3);
89     interleave(buf,  3, 11, mask[3], 3);
90     interleave(buf,  4, 12, mask[3], 3);
91     interleave(buf,  5, 13, mask[3], 3);
92     interleave(buf,  6, 14, mask[3], 3);
93     interleave(buf,  7, 15, mask[3], 3);
94 
95     interleave(buf,  0,  4, mask[2], 2);
96     interleave(buf,  1,  5, mask[2], 2);
97     interleave(buf,  2,  6, mask[2], 2);
98     interleave(buf,  3,  7, mask[2], 2);
99     interleave(buf,  8, 12, mask[2], 2);
100     interleave(buf,  9, 13, mask[2], 2);
101     interleave(buf, 10, 14, mask[2], 2);
102     interleave(buf, 11, 15, mask[2], 2);
103 
104     interleave(buf,  0,  2, mask[1], 1);
105     interleave(buf,  1,  3, mask[1], 1);
106     interleave(buf,  4,  6, mask[1], 1);
107     interleave(buf,  5,  7, mask[1], 1);
108     interleave(buf,  8, 10, mask[1], 1);
109     interleave(buf,  9, 11, mask[1], 1);
110     interleave(buf, 12, 14, mask[1], 1);
111     interleave(buf, 13, 15, mask[1], 1);
112 
113     interleave(buf,  0,  1, mask[0], 0);
114     interleave(buf,  2,  3, mask[0], 0);
115     interleave(buf,  4,  5, mask[0], 0);
116     interleave(buf,  6,  7, mask[0], 0);
117     interleave(buf,  8,  9, mask[0], 0);
118     interleave(buf, 10, 11, mask[0], 0);
119     interleave(buf, 12, 13, mask[0], 0);
120     interleave(buf, 14, 15, mask[0], 0);
121 
122     for (i = 0; i < 16; i++) {
123         for (k = 0; k <  4; k++) {
124             out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK;
125             out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK;
126         }
127     }
128 }
129 
130 /* input: in, field elements in bitsliced form */
131 /* output: out, field elements in non-bitsliced form */
PQCLEAN_MCELIECE348864_AVX_bm(uint64_t out[GFBITS],vec128 in[GFBITS])132 void PQCLEAN_MCELIECE348864_AVX_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) {
133     uint16_t i;
134     uint16_t N, L;
135 
136     uint64_t prod[ GFBITS ];
137     uint64_t in_tmp[ GFBITS ];
138 
139     uint64_t db[ GFBITS ][ 2 ];
140     uint64_t BC_tmp[ GFBITS ][ 2 ];
141     uint64_t BC[ GFBITS ][ 2 ];
142 
143     uint64_t mask, t;
144 
145     gf d, b, c0 = 1;
146 
147     gf coefs[SYS_T * 2];
148 
149     // init
150 
151     BC[0][1] = 0;
152     BC[0][0] = 1;
153     BC[0][0] <<= 63;
154 
155     for (i = 1; i < GFBITS; i++) {
156         BC[i][0] = BC[i][1] = 0;
157     }
158 
159     b = 1;
160     L = 0;
161 
162     //
163 
164     get_coefs(coefs, in);
165 
166     for (i = 0; i < GFBITS; i++) {
167         in_tmp[i] = 0;
168     }
169 
170     for (N = 0; N < SYS_T * 2; N++) {
171         // computing d
172 
173         PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(prod, in_tmp, &BC[0][0]);
174 
175         PQCLEAN_MCELIECE348864_AVX_update_asm(in_tmp, coefs[N], 8);
176 
177         d = PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(prod);
178 
179         t = PQCLEAN_MCELIECE348864_AVX_gf_mul2(c0, coefs[N], b);
180 
181         d ^= t & 0xFFFFFFFF;
182 
183         // 3 cases
184 
185         mask = mask_nonzero(d) & mask_leq(L * 2, N);
186 
187         for (i = 0; i < GFBITS; i++) {
188             db[i][0] = (d >> i) & 1;
189             db[i][0] = -db[i][0];
190             db[i][1] = (b >> i) & 1;
191             db[i][1] = -db[i][1];
192         }
193 
194         PQCLEAN_MCELIECE348864_AVX_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC);
195 
196         vec_cmov(BC, mask);
197 
198         PQCLEAN_MCELIECE348864_AVX_update_asm(BC, mask & c0, 16);
199 
200         for (i = 0; i < GFBITS; i++) {
201             BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1];
202         }
203 
204         c0 = t >> 32;
205         b = (d & mask) | (b & ~mask);
206         L = ((N + 1 - L) & mask) | (L & ~mask);
207 
208     }
209 
210     c0 = PQCLEAN_MCELIECE348864_AVX_gf_inv(c0);
211 
212     for (i = 0; i < GFBITS; i++) {
213         out[i] = (c0 >> i) & 1;
214         out[i] = -out[i];
215     }
216 
217     PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(out, out, &BC[0][0]);
218 }
219 
220