1 /*
2 This file is for public-key generation
3 */
4
5 #include "pk_gen.h"
6
7 #include "benes.h"
8 #include "controlbits.h"
9 #include "fft.h"
10 #include "params.h"
11 #include "util.h"
12
13 #include <stdint.h>
14
15 #define min(a, b) (((a) < (b)) ? (a) : (b))
16
/* Unpack bitsliced field elements into one uint64_t per element.
 *
 * in[i][j] holds bit j of 256 consecutive field elements (block i of 32
 * blocks covering all 2^GFBITS elements); the four 64-bit lanes of each
 * vec256 cover elements i*256+0..63, +64..127, +128..191, +192..255.
 * Iterating j from GFBITS-1 down to 0 shifts the most significant bit
 * in first, so out[] ends up with the element value in its low GFBITS
 * bits. */
static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) {
    int i, j, r;
    uint64_t u = 0;

    for (i = 0; i < (1 << GFBITS); i++) {
        out[i] = 0;
    }

    for (i = 0; i < 32; i++) {
        for (j = GFBITS - 1; j >= 0; j--) {
            /* lane 0: elements i*256 + 0..63 */
            u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 0);
            for (r = 0; r < 64; r++) {
                out[i * 256 + 0 * 64 + r] <<= 1;
                out[i * 256 + 0 * 64 + r] |= (u >> r) & 1;
            }
            /* lane 1: elements i*256 + 64..127 */
            u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 1);
            for (r = 0; r < 64; r++) {
                out[i * 256 + 1 * 64 + r] <<= 1;
                out[i * 256 + 1 * 64 + r] |= (u >> r) & 1;
            }
            /* lane 2: elements i*256 + 128..191 */
            u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 2);
            for (r = 0; r < 64; r++) {
                out[i * 256 + 2 * 64 + r] <<= 1;
                out[i * 256 + 2 * 64 + r] |= (u >> r) & 1;
            }
            /* lane 3: elements i*256 + 192..255 */
            u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 3);
            for (r = 0; r < 64; r++) {
                out[i * 256 + 3 * 64 + r] <<= 1;
                out[i * 256 + 3 * 64 + r] |= (u >> r) & 1;
            }
        }
    }
}
50
/* Re-bitslice packed 64-bit entries into two bitsliced outputs.
 *
 * Each in[] entry carries two GFBITS-wide fields: bits [GFBITS, 2*GFBITS)
 * go to out1 (natural bit order), bits [0, GFBITS) go to out0 with the
 * bit order reversed (out0[i][GFBITS-1-j] holds bit j).  Entries are
 * processed in blocks of 256, four 64-entry lanes per vec256. */
static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) {
    int blk, bit, lane, pos;
    uint64_t acc[4] = {0};

    for (blk = 0; blk < 32; blk++) {
        /* upper field -> out1 */
        for (bit = GFBITS - 1; bit >= 0; bit--) {
            for (lane = 0; lane < 4; lane++) {
                for (pos = 63; pos >= 0; pos--) {
                    acc[lane] <<= 1;
                    acc[lane] |= (in[blk * 256 + lane * 64 + pos] >> (bit + GFBITS)) & 1;
                }
            }

            out1[blk][bit] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(acc[0], acc[1], acc[2], acc[3]);
        }

        /* lower field -> out0, bit-reversed index */
        for (bit = GFBITS - 1; bit >= 0; bit--) {
            for (lane = 0; lane < 4; lane++) {
                for (pos = 63; pos >= 0; pos--) {
                    acc[lane] <<= 1;
                    acc[lane] |= (in[blk * 256 + lane * 64 + pos] >> bit) & 1;
                }
            }

            out0[blk][GFBITS - 1 - bit] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(acc[0], acc[1], acc[2], acc[3]);
        }
    }
}
79
/* Transpose a 64x64 bit matrix: bit j of in[i] (row i, column j) becomes
 * bit i of out[j].  Supports in-place use (out == in): the input is
 * copied into out first and the butterfly network then operates on out
 * alone. */
static void transpose_64x64(uint64_t *out, const uint64_t *in) {
    int i, j, s, d;

    uint64_t x, y;
    /* masks[d][0] selects the low half of each 2^(d+1)-bit group,
     * masks[d][1] the high half; static const so the table lives in
     * read-only storage instead of being rebuilt on every call */
    static const uint64_t masks[6][2] = {
        {0x5555555555555555, 0xAAAAAAAAAAAAAAAA},
        {0x3333333333333333, 0xCCCCCCCCCCCCCCCC},
        {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0},
        {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00},
        {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000},
        {0x00000000FFFFFFFF, 0xFFFFFFFF00000000}
    };

    for (i = 0; i < 64; i++) {
        out[i] = in[i];
    }

    /* at each level d, swap the two off-diagonal 2^d x 2^d sub-blocks of
     * every 2^(d+1) x 2^(d+1) block, from 32x32 down to single bits */
    for (d = 5; d >= 0; d--) {
        s = 1 << d;

        for (i = 0; i < 64; i += s * 2) {
            for (j = i; j < i + s; j++) {
                x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s);
                y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]);

                out[j + 0] = x;
                out[j + s] = y;
            }
        }
    }
}
111
/* return number of trailing zeros of the non-zero input in */
/* wraps the BMI1 TZCNT intrinsic; callers must pass a non-zero value
 * (the comment above is the contract -- mov_columns checks t != 0 first) */
static inline int ctz(uint64_t in) {
    return (int)_tzcnt_u64(in);
}
116
/* Constant-time equality test: returns all-ones (0xFF..FF) when x == y,
 * and 0 otherwise, without any data-dependent branch. */
static inline uint64_t same_mask(uint16_t x, uint16_t y) {
    uint64_t diff = (uint64_t)(x ^ y); /* zero exactly when x == y */

    diff = (diff - 1) >> 63;           /* 1 iff x == y (borrow out of bit 63) */

    return (uint64_t)0 - diff;         /* broadcast that bit to a full mask */
}
127
/* Move columns of mat (and the matching entries of perm) so that the
 * 32x32 block at the lower-right of the identity part becomes
 * invertible, enabling systematic-form elimination to succeed.
 *
 * mat  : the GFBITS*SYS_T-row parity-check matrix, rows stored as
 *        64-bit words
 * perm : the column permutation being constructed, updated in step
 *        with the column moves
 *
 * Returns 0 on success, -1 if the 32x64 pivot block is not full rank.
 * All selections/swaps are done with arithmetic masks (no secret-
 * dependent branches or indices). */
static int mov_columns(uint64_t mat[][ ((SYS_N + 255) / 256) * 4 ], uint32_t *perm) {
    int i, j, k, s, block_idx, row, tail;
    uint64_t buf[64], ctz_list[32], t, d, mask;

    row = GFBITS * SYS_T - 32;   /* first of the last 32 rows */
    block_idx = row / 64;
    tail = row % 64;             /* bit offset of the block inside a word */

    // extract the 32x64 matrix

    /* the 64 columns of interest straddle two 64-bit words per row */
    for (i = 0; i < 32; i++) {
        buf[i] = (mat[ row + i ][ block_idx + 0 ] >> tail) |
                 (mat[ row + i ][ block_idx + 1 ] << (64 - tail));
    }

    // compute the column indices of pivots by Gaussian elimination.
    // the indices are stored in ctz_list

    for (i = 0; i < 32; i++) {
        t = buf[i];
        for (j = i + 1; j < 32; j++) {
            t |= buf[j];
        }

        if (t == 0) {
            return -1; // return if buf is not full rank
        }

        /* pivot column = lowest set bit among the remaining rows */
        ctz_list[i] = s = ctz(t);

        /* if row i lacks the pivot bit, fold in later rows until it has it */
        for (j = i + 1; j < 32; j++) {
            mask = (buf[i] >> s) & 1;
            mask -= 1;               /* all-ones only while pivot bit is 0 */
            buf[i] ^= buf[j] & mask;
        }
        /* clear the pivot column from all other rows */
        for (j = 0; j < i; j++) {
            mask = (buf[j] >> s) & 1;
            mask = -mask;
            buf[j] ^= buf[i] & mask;
        }
        for (j = i + 1; j < 32; j++) {
            mask = (buf[j] >> s) & 1;
            mask = -mask;
            buf[j] ^= buf[i] & mask;
        }
    }

    // updating permutation

    /* constant-time swap of perm[row+j] with perm[row + ctz_list[j]]:
     * the mask selects exactly the k that equals ctz_list[j] */
    for (j = 0; j < 32; j++) {
        for (k = j + 1; k < 64; k++) {
            d = perm[ row + j ] ^ perm[ row + k ];
            d &= same_mask(k, ctz_list[j]);
            perm[ row + j ] ^= d;
            perm[ row + k ] ^= d;
        }
    }

    // moving columns of mat according to the column indices of pivots

    for (i = 0; i < GFBITS * SYS_T; i += 64) {

        /* gather the 64 affected columns of 64 rows into buf */
        for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) {
            buf[j] = (mat[ i + j ][ block_idx + 0 ] >> tail) |
                     (mat[ i + j ][ block_idx + 1 ] << (64 - tail));
        }

        /* transpose so columns become rows, swap rows, transpose back */
        transpose_64x64(buf, buf);

        for (j = 0; j < 32; j++) {
            for (k = j + 1; k < 64; k++) {
                d = buf[ j ] ^ buf[ k ];
                d &= same_mask(k, ctz_list[j]);
                buf[ j ] ^= d;
                buf[ k ] ^= d;
            }
        }

        transpose_64x64(buf, buf);

        /* scatter buf back into the two straddled words, preserving the
         * untouched low bits of word 0 and high bits of word 1 */
        for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) {
            mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << (64 - tail) >> (64 - tail)) | (buf[j] << tail);
            mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> tail << tail) | (buf[j] >> (64 - tail));
        }
    }

    return 0;
}
216
217 #define NBLOCKS1_H ((SYS_N + 63) / 64)
218 #define NBLOCKS2_H ((SYS_N + 255) / 256)
219 #define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64)
/* Generate the McEliece public key from the secret key.
 *
 * pk   : output public key (the non-identity part of the systematic
 *        parity-check matrix, rows packed to PK_ROW_BYTES each)
 * perm : the column-permutation seed values; overwritten with the
 *        resulting field-element ordering
 * sk   : secret Goppa polynomial, as stored in the secret key
 *
 * Returns 0 on success, -1 if the matrix cannot be brought to
 * (semi-)systematic form and key generation must be retried. */
int PQCLEAN_MCELIECE6960119F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) {
    const int block_idx = NBLOCKS1_I - 1;       /* first word holding public columns */
    int tail = (GFBITS * SYS_T) % 64;           /* bit offset of those columns */

    int i, j, k;
    int row, c;

    uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ];

    uint64_t mask;

    vec128 sk_int[ GFBITS ];

    vec256 consts[ 32 ][ GFBITS ];
    vec256 eval[ 32 ][ GFBITS ];
    vec256 prod[ 32 ][ GFBITS ];
    vec256 tmp[ GFBITS ];

    uint64_t list[1 << GFBITS];
    uint64_t one_row[ NBLOCKS2_H * 4 ];

    // compute the inverses

    PQCLEAN_MCELIECE6960119F_AVX_irr_load(sk_int, sk);

    /* evaluate the Goppa polynomial at all field points (bitsliced) */
    PQCLEAN_MCELIECE6960119F_AVX_fft(eval, sk_int);

    /* batch inversion (Montgomery's trick): build prefix products ... */
    PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(prod[0], eval[0]);

    for (i = 1; i < 32; i++) {
        PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]);
    }

    /* ... invert only the total product ... */
    PQCLEAN_MCELIECE6960119F_AVX_vec256_inv(tmp, prod[31]);

    /* ... then peel off individual inverses back-to-front */
    for (i = 30; i >= 0; i--) {
        PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[i + 1], prod[i], tmp);
        PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(tmp, tmp, eval[i + 1]);
    }

    PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(prod[0], tmp);

    // fill matrix

    de_bitslicing(list, prod);

    /* pack (perm[i] as sort key) | (inverse value) | (index i) into one
     * 63-bit word each, so one sort applies the permutation to everything */
    for (i = 0; i < (1 << GFBITS); i++) {
        list[i] <<= GFBITS;
        list[i] |= i;
        list[i] |= ((uint64_t) perm[i]) << 31;
    }

    PQCLEAN_MCELIECE6960119F_AVX_sort_63b(1 << GFBITS, list);

    to_bitslicing_2x(consts, prod, list);

    /* low GFBITS bits of each sorted entry = permuted field-element index */
    for (i = 0; i < (1 << GFBITS); i++) {
        perm[i] = list[i] & GFMASK;
    }

    /* first GFBITS rows: the inverses 1/g(a_i) themselves */
    for (j = 0; j < NBLOCKS2_H; j++) {
        for (k = 0; k < GFBITS; k++) {
            mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 0);
            mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 1);
            mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 2);
            mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 3);
        }
    }

    /* remaining row groups: multiply by a_i each round, giving rows
     * a_i^row / g(a_i) of the parity-check matrix */
    for (i = 1; i < SYS_T; i++) {
        for (j = 0; j < NBLOCKS2_H; j++) {
            PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[j], prod[j], consts[j]);

            for (k = 0; k < GFBITS; k++) {
                mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 0);
                mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 1);
                mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 2);
                mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 3);
            }
        }
    }

    // gaussian elimination

    for (row = 0; row < PK_NROWS; row++) {
        i = row >> 6;     /* word index of the pivot bit */
        j = row & 63;     /* bit index within that word */

        /* at the last 32 rows, rearrange columns so the remaining pivot
         * block is invertible (semi-systematic / "f" variant) */
        if (row == GFBITS * SYS_T - 32) {
            if (mov_columns(mat, perm)) {
                return -1;
            }
        }

        /* make the pivot bit 1 by masked-adding rows below */
        for (k = row + 1; k < PK_NROWS; k++) {
            mask = mat[ row ][ i ] >> j;
            mask &= 1;
            mask -= 1;    /* all-ones while the pivot bit is still 0 */

            for (c = 0; c < NBLOCKS1_H; c++) {
                mat[ row ][ c ] ^= mat[ k ][ c ] & mask;
            }
        }

        if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic
            return -1;
        }

        /* clear the pivot column above ... */
        for (k = 0; k < row; k++) {
            mask = mat[ k ][ i ] >> j;
            mask &= 1;
            mask = -mask;

            for (c = 0; c < NBLOCKS1_H; c++) {
                mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
            }
        }

        /* ... and below the pivot */
        for (k = row + 1; k < PK_NROWS; k++) {
            mask = mat[ k ][ i ] >> j;
            mask &= 1;
            mask = -mask;

            for (c = 0; c < NBLOCKS1_H; c++) {
                mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
            }
        }
    }

    /* serialize: the public part starts mid-word (tail bits in), so each
     * row is re-aligned to bit 0 before being stored */
    for (row = 0; row < PK_NROWS; row++) {
        for (k = block_idx; k < NBLOCKS1_H; k++) {
            one_row[k] = mat[ row ][k];
        }

        for (k = block_idx; k < NBLOCKS1_H - 1; k++) {
            one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail));
            PQCLEAN_MCELIECE6960119F_AVX_store8(pk, one_row[k]);
            pk += 8;
        }

        /* last, partial word of the row */
        one_row[k] >>= tail;
        PQCLEAN_MCELIECE6960119F_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8);

        pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits

        pk += PK_ROW_BYTES % 8;
    }

    //

    return 0;
}
372
373