/*
  This file is for the inversion-free Berlekamp-Massey algorithm
  see https://ieeexplore.ieee.org/document/87857
*/

#include "bm.h"

#include "gf.h"
#include "params.h"
#include "vec128.h"

#include <stdint.h>

extern gf PQCLEAN_MCELIECE6688128_AVX_vec_reduce_asm(vec128 *);
extern void PQCLEAN_MCELIECE6688128_AVX_update_asm(void *, gf, int);

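/* return 0xFFFF if a != 0, and 0 otherwise (constant time) */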
static inline uint16_t mask_nonzero(gf a) {
    uint32_t ret = a;

    ret -= 1;
    ret >>= 31;
    ret -= 1;

    return ret;
}

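/* return 0xFFFF if a <= b, and 0 otherwise (constant time) */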
static inline uint16_t mask_leq(uint16_t a, uint16_t b) {
    uint32_t a_tmp = a;
    uint32_t b_tmp = b;
    uint32_t ret = b_tmp - a_tmp;

    ret >>= 31;
    ret -= 1;

    return ret;
}

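/* if mask is all ones, overwrite out[i][0] with out[i][1] for every i;
   if mask is zero, leave out unchanged (constant-time conditional move) */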
static inline void vec128_cmov(vec128 out[][2], uint16_t mask) {
    int i;

    vec128 v0, v1;

    vec128 m0 = PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b( mask);
    vec128 m1 = PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b(~mask);

    for (i = 0; i < GFBITS; i++) {
        v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_and(out[i][1], m0);
        v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_and(out[i][0], m1);
        out[i][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_or(v0, v1);
    }
}

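/* one butterfly step of a bit-matrix transpose: exchange the (1 << b)-bit
   groups selected by mask[0]/mask[1] between in[idx0] and in[idx1] */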
static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) {
    int s = 1 << b;

    vec256 x, y;

    x = PQCLEAN_MCELIECE6688128_AVX_vec256_or(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx0], mask[0]),
            PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx1], mask[0]), s));

    y = PQCLEAN_MCELIECE6688128_AVX_vec256_or(PQCLEAN_MCELIECE6688128_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx0], mask[1]), s),
            PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx1], mask[1]));

    in[idx0] = x;
    in[idx1] = y;
}

/* input: in, field elements in bitsliced form */
/* output: out, field elements in non-bitsliced form */
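/* The four rounds of interleave() below perform a 16x16 bit-matrix transpose
   of the 16 buffer rows, 16 bit-columns at a time, so that afterwards each
   16-bit chunk of buf[i] holds one complete field element. */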
static inline void get_coefs(gf *out, vec256 *in) {
    int i, k;

    vec256 mask[4][2];
    vec256 buf[16];

    for (i =  0; i < 13; i++) {
        buf[i] = in[i];
    }
    for (i = 13; i < 16; i++) {
        buf[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_setzero();
    }

    mask[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x5555);
    mask[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xAAAA);
    mask[1][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x3333);
    mask[1][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xCCCC);
    mask[2][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x0F0F);
    mask[2][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xF0F0);
    mask[3][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x00FF);
    mask[3][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xFF00);

    interleave(buf,  0,  8, mask[3], 3);
    interleave(buf,  1,  9, mask[3], 3);
    interleave(buf,  2, 10, mask[3], 3);
    interleave(buf,  3, 11, mask[3], 3);
    interleave(buf,  4, 12, mask[3], 3);
    interleave(buf,  5, 13, mask[3], 3);
    interleave(buf,  6, 14, mask[3], 3);
    interleave(buf,  7, 15, mask[3], 3);

    interleave(buf,  0,  4, mask[2], 2);
    interleave(buf,  1,  5, mask[2], 2);
    interleave(buf,  2,  6, mask[2], 2);
    interleave(buf,  3,  7, mask[2], 2);
    interleave(buf,  8, 12, mask[2], 2);
    interleave(buf,  9, 13, mask[2], 2);
    interleave(buf, 10, 14, mask[2], 2);
    interleave(buf, 11, 15, mask[2], 2);

    interleave(buf,  0,  2, mask[1], 1);
    interleave(buf,  1,  3, mask[1], 1);
    interleave(buf,  4,  6, mask[1], 1);
    interleave(buf,  5,  7, mask[1], 1);
    interleave(buf,  8, 10, mask[1], 1);
    interleave(buf,  9, 11, mask[1], 1);
    interleave(buf, 12, 14, mask[1], 1);
    interleave(buf, 13, 15, mask[1], 1);

    interleave(buf,  0,  1, mask[0], 0);
    interleave(buf,  2,  3, mask[0], 0);
    interleave(buf,  4,  5, mask[0], 0);
    interleave(buf,  6,  7, mask[0], 0);
    interleave(buf,  8,  9, mask[0], 0);
    interleave(buf, 10, 11, mask[0], 0);
    interleave(buf, 12, 13, mask[0], 0);
    interleave(buf, 14, 15, mask[0], 0);

    for (i = 0; i < 16; i++) {
        for (k = 0; k <  4; k++) {
            out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK;
            out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK;
            out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK;
            out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK;
        }
    }
}

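/* the same GFBITS x 256-bit buffer, viewable both as pairs of vec128
   (for the 128-bit assembly routines) and as vec256 (for vec256_mul) */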
typedef union {
    vec128 as_128[GFBITS][2];
    vec256 as_256[GFBITS];
} aligned_double_vec128;

/* input: in, sequence of field elements */
/* output: out, minimal polynomial of in */
void PQCLEAN_MCELIECE6688128_AVX_bm(vec128 *out, vec256 *in) {
    int i;
    uint16_t N, L;
    uint16_t mask;
    uint64_t one = 1, t;

    vec128 prod[ GFBITS ];
    vec128 interval[GFBITS];

    aligned_double_vec128 db;
    aligned_double_vec128 BC_tmp;
    aligned_double_vec128 BC;

    gf d, b, c0 = 1;
    gf coefs[256];

    // initialization

    get_coefs(coefs, in);

    BC.as_128[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0, one << 63);
    BC.as_128[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero();

    for (i = 1; i < GFBITS; i++) {
        BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero();
    }

    b = 1;
    L = 0;

    //

    for (i = 0; i < GFBITS; i++) {
        interval[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero();
    }

    for (N = 0; N < 256; N++) {
        PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32);
        PQCLEAN_MCELIECE6688128_AVX_update_asm(interval, coefs[N], 16);

        d = PQCLEAN_MCELIECE6688128_AVX_vec_reduce_asm(prod);

        t = PQCLEAN_MCELIECE6688128_AVX_gf_mul2(c0, coefs[N], b);
        d ^= t & 0xFFFFFFFF;

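        /* mask is all ones exactly when d != 0 and 2*L <= N, i.e., in the
           iterations where Berlekamp-Massey takes the length-changing update
           (B <- old C, L <- N + 1 - L); otherwise mask is zero */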
        mask = mask_nonzero(d) & mask_leq(L * 2, N);

        for (i = 0; i < GFBITS; i++) {
            db.as_128[i][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((d >> i) & 1);
            db.as_128[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((b >> i) & 1);
        }

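        /* BC_tmp receives the lane-wise bitsliced products: d times B in the
           lower 128 lanes and b times C in the upper 128 lanes; the new C
           assembled below is their sum b*C + d*B, the inversion-free update
           that avoids dividing by the previous discrepancy */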
        PQCLEAN_MCELIECE6688128_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256);

        vec128_cmov(BC.as_128, mask);
        PQCLEAN_MCELIECE6688128_AVX_update_asm(BC.as_128, c0 & mask, 32);

        for (i = 0; i < GFBITS; i++) {
            BC.as_128[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]);
        }

        c0 = t >> 32;
        b = (d & mask) | (b & ~mask);
        L = ((N + 1 - L) & mask) | (L & ~mask);
    }

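    /* the single field inversion of the whole routine: multiply C by the
       inverse of the scale factor c0 picked up during the inversion-free
       iterations, normalising the output polynomial */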
    c0 = PQCLEAN_MCELIECE6688128_AVX_gf_inv(c0);

    for (i = 0; i < GFBITS; i++) {
        prod[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((c0 >> i) & 1);
    }

    PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(out, prod, BC.as_128[0] + 1, 32);
}