1 /*
2   This file is for the inversion-free Berlekamp-Massey algorithm
3   see https://ieeexplore.ieee.org/document/87857
4 */
5 
6 #include "bm.h"
7 
8 #include "gf.h"
9 
mask_nonzero(gf a)10 static inline uint16_t mask_nonzero(gf a) {
11     uint32_t ret = a;
12 
13     ret -= 1;
14     ret >>= 31;
15     ret -= 1;
16 
17     return (uint16_t)ret;
18 }
19 
mask_leq(uint16_t a,uint16_t b)20 static inline uint16_t mask_leq(uint16_t a, uint16_t b) {
21     uint32_t a_tmp = a;
22     uint32_t b_tmp = b;
23     uint32_t ret = b_tmp - a_tmp;
24 
25     ret >>= 31;
26     ret -= 1;
27 
28     return (uint16_t)ret;
29 }
30 
vec_cmov(vec * out,const vec * in,uint16_t mask)31 static inline void vec_cmov(vec *out, const vec *in, uint16_t mask) {
32     int i;
33 
34     vec m0, m1;
35 
36     m0 = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(mask);
37     m1 = ~m0;
38 
39     for (i = 0; i < GFBITS; i++) {
40         out[i] = (in[i] & m0) | (out[i] & m1);
41         out[i] = (in[i] & m0) | (out[i] & m1);
42     }
43 }
44 
interleave(vec * in,int idx0,int idx1,const vec * mask,int b)45 static inline void interleave(vec *in, int idx0, int idx1, const vec *mask, int b) {
46     int s = 1 << b;
47 
48     vec x, y;
49 
50     x = (in[idx0] & mask[0]) | ((in[idx1] & mask[0]) << s);
51     y = ((in[idx0] & mask[1]) >> s) | (in[idx1] & mask[1]);
52 
53     in[idx0] = x;
54     in[idx1] = y;
55 }
56 
57 /* input: in, field elements in bitsliced form */
58 /* output: out, field elements in non-bitsliced form */
get_coefs(gf * out,const vec * in)59 static inline void get_coefs(gf *out, const vec *in) {
60     int i, k;
61 
62     vec mask[4][2];
63     vec buf[16];
64 
65     for (i =  0; i < 13; i++) {
66         buf[i] = in[i];
67     }
68     for (i = 13; i < 16; i++) {
69         buf[i] = 0;
70     }
71 
72     mask[0][0] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0x5555);
73     mask[0][1] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0xAAAA);
74     mask[1][0] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0x3333);
75     mask[1][1] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0xCCCC);
76     mask[2][0] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0x0F0F);
77     mask[2][1] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0xF0F0);
78     mask[3][0] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0x00FF);
79     mask[3][1] = PQCLEAN_MCELIECE8192128F_VEC_vec_set1_16b(0xFF00);
80 
81     interleave(buf,  0,  8, mask[3], 3);
82     interleave(buf,  1,  9, mask[3], 3);
83     interleave(buf,  2, 10, mask[3], 3);
84     interleave(buf,  3, 11, mask[3], 3);
85     interleave(buf,  4, 12, mask[3], 3);
86     interleave(buf,  5, 13, mask[3], 3);
87     interleave(buf,  6, 14, mask[3], 3);
88     interleave(buf,  7, 15, mask[3], 3);
89 
90     interleave(buf,  0,  4, mask[2], 2);
91     interleave(buf,  1,  5, mask[2], 2);
92     interleave(buf,  2,  6, mask[2], 2);
93     interleave(buf,  3,  7, mask[2], 2);
94     interleave(buf,  8, 12, mask[2], 2);
95     interleave(buf,  9, 13, mask[2], 2);
96     interleave(buf, 10, 14, mask[2], 2);
97     interleave(buf, 11, 15, mask[2], 2);
98 
99     interleave(buf,  0,  2, mask[1], 1);
100     interleave(buf,  1,  3, mask[1], 1);
101     interleave(buf,  4,  6, mask[1], 1);
102     interleave(buf,  5,  7, mask[1], 1);
103     interleave(buf,  8, 10, mask[1], 1);
104     interleave(buf,  9, 11, mask[1], 1);
105     interleave(buf, 12, 14, mask[1], 1);
106     interleave(buf, 13, 15, mask[1], 1);
107 
108     interleave(buf,  0,  1, mask[0], 0);
109     interleave(buf,  2,  3, mask[0], 0);
110     interleave(buf,  4,  5, mask[0], 0);
111     interleave(buf,  6,  7, mask[0], 0);
112     interleave(buf,  8,  9, mask[0], 0);
113     interleave(buf, 10, 11, mask[0], 0);
114     interleave(buf, 12, 13, mask[0], 0);
115     interleave(buf, 14, 15, mask[0], 0);
116 
117     for (i = 0; i < 16; i++) {
118         for (k = 0; k <  4; k++) {
119             out[ k * 16 + i ] = (buf[i] >> (k * 16)) & GFMASK;
120         }
121     }
122 }
123 
update(vec in[][GFBITS],const gf e)124 static void update(vec in[][GFBITS], const gf e) {
125     int i;
126     vec tmp;
127 
128     for (i = 0; i < GFBITS; i++) {
129         tmp = (e >> i) & 1;
130 
131         in[0][i] = (in[0][i] >> 1) | (in[1][i] << 63);
132         in[1][i] = (in[1][i] >> 1) | (tmp      << 63);
133     }
134 }
135 
vec_reduce(vec in[][GFBITS])136 static inline gf vec_reduce(vec in[][GFBITS]) {
137     int i;
138     vec tmp;
139     gf ret = 0;
140 
141     for (i = GFBITS - 1; i >= 0; i--) {
142         tmp = in[0][i] ^ in[1][i];
143 
144         tmp ^= tmp >> 32;
145         tmp ^= tmp >> 16;
146         tmp ^= tmp >> 8;
147         tmp ^= tmp >> 4;
148         tmp ^= tmp >> 2;
149         tmp ^= tmp >> 1;
150 
151         ret <<= 1;
152         ret |= tmp & 1;
153     }
154 
155     return ret;
156 }
157 
158 /* input: in, sequence of field elements */
159 /* output: out, minimal polynomial of in */
PQCLEAN_MCELIECE8192128F_VEC_bm(vec out[][GFBITS],vec in[][GFBITS])160 void PQCLEAN_MCELIECE8192128F_VEC_bm(vec out[][ GFBITS ], vec in[][ GFBITS ]) {
161     int i;
162     uint16_t N, L;
163     uint16_t mask;
164     uint64_t one = 1, t;
165 
166     vec prod[2][GFBITS];
167     vec interval[2][GFBITS];
168     vec dd[2][GFBITS], bb[2][GFBITS];
169     vec B[2][GFBITS], C[2][GFBITS];
170     vec B_tmp[2][GFBITS], C_tmp[2][GFBITS];
171     vec v[GFBITS];
172 
173     gf d, b, c0 = 1;
174     gf coefs[256];
175 
176     // initialization
177 
178     get_coefs(&coefs[  0], in[0]);
179     get_coefs(&coefs[ 64], in[1]);
180     get_coefs(&coefs[128], in[2]);
181     get_coefs(&coefs[192], in[3]);
182 
183     C[0][0] = 0;
184     C[1][0] = 0;
185     B[0][0] = 0;
186     B[1][0] = one << 63;
187 
188     for (i = 1; i < GFBITS; i++) {
189         C[0][i] = C[1][i] = B[0][i] = B[1][i] = 0;
190     }
191 
192     b = 1;
193     L = 0;
194 
195     //
196 
197     for (i = 0; i < GFBITS; i++) {
198         interval[0][i] = interval[1][i] = 0;
199     }
200 
201     for (N = 0; N < 256; N++) {
202         PQCLEAN_MCELIECE8192128F_VEC_vec_mul(prod[0], C[0], interval[0]);
203         PQCLEAN_MCELIECE8192128F_VEC_vec_mul(prod[1], C[1], interval[1]);
204         update(interval, coefs[N]);
205         d = vec_reduce(prod);
206 
207         t = PQCLEAN_MCELIECE8192128F_VEC_gf_mul2(c0, coefs[N], b);
208         d ^= t & 0xFFFFFFFF;
209 
210         mask = mask_nonzero(d) & mask_leq(L * 2, N);
211 
212         for (i = 0; i < GFBITS; i++) {
213             dd[0][i] = dd[1][i] = PQCLEAN_MCELIECE8192128F_VEC_vec_setbits((d >> i) & 1);
214             bb[0][i] = bb[1][i] = PQCLEAN_MCELIECE8192128F_VEC_vec_setbits((b >> i) & 1);
215         }
216 
217         PQCLEAN_MCELIECE8192128F_VEC_vec_mul(B_tmp[0], dd[0], B[0]);
218         PQCLEAN_MCELIECE8192128F_VEC_vec_mul(B_tmp[1], dd[1], B[1]);
219         PQCLEAN_MCELIECE8192128F_VEC_vec_mul(C_tmp[0], bb[0], C[0]);
220         PQCLEAN_MCELIECE8192128F_VEC_vec_mul(C_tmp[1], bb[1], C[1]);
221 
222         vec_cmov(B[0], C[0], mask);
223         vec_cmov(B[1], C[1], mask);
224         update(B, c0 & mask);
225 
226         for (i = 0; i < GFBITS; i++) {
227             C[0][i] = B_tmp[0][i] ^ C_tmp[0][i];
228             C[1][i] = B_tmp[1][i] ^ C_tmp[1][i];
229         }
230 
231         c0 = (gf)(t >> 32);
232         b = (d & mask) | (b & ~mask);
233         L = ((N + 1 - L) & mask) | (L & ~mask);
234     }
235 
236     c0 = PQCLEAN_MCELIECE8192128F_VEC_gf_inv(c0);
237 
238     for (i = 0; i < GFBITS; i++) {
239         v[i] = PQCLEAN_MCELIECE8192128F_VEC_vec_setbits((c0 >> i) & 1);
240     }
241 
242     PQCLEAN_MCELIECE8192128F_VEC_vec_mul(out[0], C[0], v);
243     PQCLEAN_MCELIECE8192128F_VEC_vec_mul(out[1], C[1], v);
244 }
245 
246