1 /*
2 This file is for the inversion-free Berlekamp-Massey algorithm
3 see https://ieeexplore.ieee.org/document/87857
4 */
5
6 #include "bm.h"
7
8 #include "gf.h"
9 #include "params.h"
10 #include "vec128.h"
11
12 #include <stdint.h>
13
14 extern gf PQCLEAN_MCELIECE6688128_AVX_vec_reduce_asm(vec128 *);
15 extern void PQCLEAN_MCELIECE6688128_AVX_update_asm(void *, gf, int);
16
/* Constant-time mask: returns 0xFFFF when a != 0, and 0 when a == 0.
 * Branch-free on purpose — do not "simplify" into a comparison. */
static inline uint16_t mask_nonzero(gf a) {
    uint32_t t = a;

    /* (a - 1) borrows out of bit 31 exactly when a == 0 */
    t = ((t - 1) >> 31) - 1;

    return (uint16_t)t;
}
26
/* Constant-time mask: returns 0xFFFF when a <= b, and 0 otherwise.
 * Branch-free on purpose — do not "simplify" into a comparison. */
static inline uint16_t mask_leq(uint16_t a, uint16_t b) {
    uint32_t diff = (uint32_t)b - (uint32_t)a;

    /* bit 31 of diff is set exactly when a > b (unsigned wraparound) */
    return (uint16_t)((diff >> 31) - 1);
}
37
/* Constant-time conditional move over GFBITS rows of vec128 pairs:
 * for each row i, out[i][0] <- out[i][1] when mask == 0xFFFF,
 * and out[i][0] is kept unchanged when mask == 0. */
static inline void vec128_cmov(vec128 out[][2], uint16_t mask) {
    vec128 take, keep;
    int i;

    vec128 sel_new = PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b( mask);
    vec128 sel_old = PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b(~mask);

    for (i = 0; i < GFBITS; i++) {
        take = PQCLEAN_MCELIECE6688128_AVX_vec128_and(out[i][1], sel_new);
        keep = PQCLEAN_MCELIECE6688128_AVX_vec128_and(out[i][0], sel_old);
        out[i][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_or(take, keep);
    }
}
52
/* One butterfly step of the bitwise transpose used by get_coefs():
 * combines the mask[0]-selected (low) parts of in[idx0] and in[idx1]
 * into in[idx0], and the mask[1]-selected (high) parts into in[idx1],
 * shifting by 2^b bit positions to line the lanes up. */
static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) {
    int shift = 1 << b;

    vec256 lo, hi;

    lo = PQCLEAN_MCELIECE6688128_AVX_vec256_or(
             PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx0], mask[0]),
             PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx1], mask[0]), shift));

    hi = PQCLEAN_MCELIECE6688128_AVX_vec256_or(
             PQCLEAN_MCELIECE6688128_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx0], mask[1]), shift),
             PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx1], mask[1]));

    in[idx0] = lo;
    in[idx1] = hi;
}
67
68 /* input: in, field elements in bitsliced form */
69 /* output: out, field elements in non-bitsliced form */
/* Convert GFBITS bitsliced vec256 words into 256 non-bitsliced field
 * elements via a log-depth bit-matrix transpose.
 *
 * in  : GFBITS (13) vec256 words; bit j of word i is bit i of element j
 * out : 256 gf elements in normal representation
 *
 * NOTE(review): the exact output ordering (the (4*l + k)*16 + i index
 * scheme below) is dictated by the consumers in bm(); do not reorder. */
static inline void get_coefs(gf *out, vec256 *in) {
    int i, k;

    vec256 mask[4][2];
    vec256 buf[16];

    /* pad the 13 input words up to a 16-word block so the transpose
       network below (which needs a power-of-two size) can run */
    for (i = 0; i < 13; i++) {
        buf[i] = in[i];
    }
    for (i = 13; i < 16; i++) {
        buf[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_setzero();
    }

    /* mask[b][0] selects the low 2^b bits of every 2^(b+1)-bit group;
       mask[b][1] selects the high 2^b bits */
    mask[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x5555);
    mask[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xAAAA);
    mask[1][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x3333);
    mask[1][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xCCCC);
    mask[2][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x0F0F);
    mask[2][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xF0F0);
    mask[3][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x00FF);
    mask[3][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xFF00);

    /* transpose network: butterfly exchanges at stride 8, 4, 2, 1 */
    interleave(buf, 0, 8, mask[3], 3);
    interleave(buf, 1, 9, mask[3], 3);
    interleave(buf, 2, 10, mask[3], 3);
    interleave(buf, 3, 11, mask[3], 3);
    interleave(buf, 4, 12, mask[3], 3);
    interleave(buf, 5, 13, mask[3], 3);
    interleave(buf, 6, 14, mask[3], 3);
    interleave(buf, 7, 15, mask[3], 3);

    interleave(buf, 0, 4, mask[2], 2);
    interleave(buf, 1, 5, mask[2], 2);
    interleave(buf, 2, 6, mask[2], 2);
    interleave(buf, 3, 7, mask[2], 2);
    interleave(buf, 8, 12, mask[2], 2);
    interleave(buf, 9, 13, mask[2], 2);
    interleave(buf, 10, 14, mask[2], 2);
    interleave(buf, 11, 15, mask[2], 2);

    interleave(buf, 0, 2, mask[1], 1);
    interleave(buf, 1, 3, mask[1], 1);
    interleave(buf, 4, 6, mask[1], 1);
    interleave(buf, 5, 7, mask[1], 1);
    interleave(buf, 8, 10, mask[1], 1);
    interleave(buf, 9, 11, mask[1], 1);
    interleave(buf, 12, 14, mask[1], 1);
    interleave(buf, 13, 15, mask[1], 1);

    interleave(buf, 0, 1, mask[0], 0);
    interleave(buf, 2, 3, mask[0], 0);
    interleave(buf, 4, 5, mask[0], 0);
    interleave(buf, 6, 7, mask[0], 0);
    interleave(buf, 8, 9, mask[0], 0);
    interleave(buf, 10, 11, mask[0], 0);
    interleave(buf, 12, 13, mask[0], 0);
    interleave(buf, 14, 15, mask[0], 0);

    /* unpack: each 256-bit word holds 4 lanes x 4 half-words = 16
       transposed field elements; mask off the top bits via GFMASK */
    for (i = 0; i < 16; i++) {
        for (k = 0; k < 4; k++) {
            out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK;
            out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK;
            out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK;
            out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK;
        }
    }
}
137
/* One buffer, two views: GFBITS rows of two vec128 halves, or the same
 * storage as GFBITS vec256 words — lets the 128-bit asm helpers and the
 * 256-bit multiply operate on the same data without copying. */
typedef union {
    vec128 as_128[GFBITS][2];
    vec256 as_256[GFBITS];
} aligned_double_vec128;
142
143 /* input: in, sequence of field elements */
144 /* output: out, minimal polynomial of in */
/* input: in, sequence of field elements */
/* output: out, minimal polynomial of in */
void PQCLEAN_MCELIECE6688128_AVX_bm(vec128 *out, vec256 *in) {
    int i;
    uint16_t N, L;     /* N: iteration index; L: current LFSR length */
    uint16_t mask;     /* 0xFFFF when a length-changing update applies */
    uint64_t one = 1, t;

    vec128 prod[ GFBITS ];     /* bitsliced product used to compute d */
    vec128 interval[GFBITS];   /* sliding window of recent coefficients */

    aligned_double_vec128 db;     /* broadcast of scalars d and b, bitsliced */
    aligned_double_vec128 BC_tmp; /* d*C and b*B, before recombination */
    aligned_double_vec128 BC;     /* B and C polynomials, bitsliced, interleaved
                                     per row (layout fixed by the asm helpers) */

    gf d, b, c0 = 1;   /* discrepancy, previous discrepancy, leading coeff */
    gf coefs[256];

    // initialization

    get_coefs(coefs, in);

    /* start with a single set bit — constant polynomial 1 in this layout */
    BC.as_128[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0, one << 63);
    BC.as_128[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero();

    for (i = 1; i < GFBITS; i++) {
        BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero();
    }

    b = 1;
    L = 0;

    //

    for (i = 0; i < GFBITS; i++) {
        interval[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero();
    }

    /* main loop: 256 constant-time iterations of inversion-free BM.
       All updates are applied unconditionally and selected via masks —
       no secret-dependent branches. */
    for (N = 0; N < 256; N++) {
        /* discrepancy d = <C, interval> plus the scalar term below */
        PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32);
        /* NOTE(review): update_asm shifts coefs[N] into the interval
           window — exact layout is defined by the asm; confirm there */
        PQCLEAN_MCELIECE6688128_AVX_update_asm(interval, coefs[N], 16);

        d = PQCLEAN_MCELIECE6688128_AVX_vec_reduce_asm(prod);

        /* gf_mul2 packs two products into one 64-bit word:
           low 32 bits fold into d, high 32 bits become the next c0 */
        t = PQCLEAN_MCELIECE6688128_AVX_gf_mul2(c0, coefs[N], b);
        d ^= t & 0xFFFFFFFF;

        /* classic BM condition: d != 0 and 2L <= N, evaluated branch-free */
        mask = mask_nonzero(d) & mask_leq(L * 2, N);

        /* broadcast each bit of d and b across a vec128 lane for the
           bitsliced multiply below */
        for (i = 0; i < GFBITS; i++) {
            db.as_128[i][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((d >> i) & 1);
            db.as_128[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((b >> i) & 1);
        }

        /* BC_tmp = (d*C, b*B) computed in one 256-bit multiply */
        PQCLEAN_MCELIECE6688128_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256);

        /* conditionally promote C to the new B, then shift (masked) */
        vec128_cmov(BC.as_128, mask);
        PQCLEAN_MCELIECE6688128_AVX_update_asm(BC.as_128, c0 & mask, 32);

        /* new C = b*C + d*B (inversion-free update) */
        for (i = 0; i < GFBITS; i++) {
            BC.as_128[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]);
        }

        /* masked scalar bookkeeping, all branch-free */
        c0 = t >> 32;
        b = (d & mask) | (b & ~mask);
        L = ((N + 1 - L) & mask) | (L & ~mask);
    }

    /* make the output monic: divide by the leading coefficient */
    c0 = PQCLEAN_MCELIECE6688128_AVX_gf_inv(c0);

    for (i = 0; i < GFBITS; i++) {
        prod[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((c0 >> i) & 1);
    }

    PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(out, prod, BC.as_128[0] + 1, 32);
}
219
220