1 /*
2   This file is for public-key generation
3 */
4 
5 #include "pk_gen.h"
6 
7 #include "benes.h"
8 #include "controlbits.h"
9 #include "fft.h"
10 #include "params.h"
11 #include "util.h"
12 
13 #include <stdint.h>
14 
15 #define min(a, b) (((a) < (b)) ? (a) : (b))
16 
de_bitslicing(uint64_t * out,vec256 in[][GFBITS])17 static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) {
18     int i, j, r;
19     uint64_t u = 0;
20 
21     for (i = 0; i < (1 << GFBITS); i++) {
22         out[i] = 0 ;
23     }
24 
25     for (i = 0; i < 32; i++) {
26         for (j = GFBITS - 1; j >= 0; j--) {
27             u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 0);
28             for (r = 0; r < 64; r++) {
29                 out[i * 256 + 0 * 64 + r] <<= 1;
30                 out[i * 256 + 0 * 64 + r] |= (u >> r) & 1;
31             }
32             u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 1);
33             for (r = 0; r < 64; r++) {
34                 out[i * 256 + 1 * 64 + r] <<= 1;
35                 out[i * 256 + 1 * 64 + r] |= (u >> r) & 1;
36             }
37             u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 2);
38             for (r = 0; r < 64; r++) {
39                 out[i * 256 + 2 * 64 + r] <<= 1;
40                 out[i * 256 + 2 * 64 + r] |= (u >> r) & 1;
41             }
42             u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 3);
43             for (r = 0; r < 64; r++) {
44                 out[i * 256 + 3 * 64 + r] <<= 1;
45                 out[i * 256 + 3 * 64 + r] |= (u >> r) & 1;
46             }
47         }
48     }
49 }
50 
to_bitslicing_2x(vec256 out0[][GFBITS],vec256 out1[][GFBITS],const uint64_t * in)51 static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) {
52     int i, j, k, r;
53     uint64_t u[4] = {0};
54 
55     for (i = 0; i < 32; i++) {
56         for (j = GFBITS - 1; j >= 0; j--) {
57             for (k = 0; k < 4; k++) {
58                 for (r = 63; r >= 0; r--) {
59                     u[k] <<= 1;
60                     u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1;
61                 }
62             }
63 
64             out1[i][j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]);
65         }
66 
67         for (j = GFBITS - 1; j >= 0; j--) {
68             for (k = 0; k < 4; k++) {
69                 for (r = 63; r >= 0; r--) {
70                     u[k] <<= 1;
71                     u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1;
72                 }
73             }
74 
75             out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]);
76         }
77     }
78 }
79 
transpose_64x64(uint64_t * out,const uint64_t * in)80 static void transpose_64x64(uint64_t *out, const uint64_t *in) {
81     int i, j, s, d;
82 
83     uint64_t x, y;
84     uint64_t masks[6][2] = {
85         {0x5555555555555555, 0xAAAAAAAAAAAAAAAA},
86         {0x3333333333333333, 0xCCCCCCCCCCCCCCCC},
87         {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0},
88         {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00},
89         {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000},
90         {0x00000000FFFFFFFF, 0xFFFFFFFF00000000}
91     };
92 
93     for (i = 0; i < 64; i++) {
94         out[i] = in[i];
95     }
96 
97     for (d = 5; d >= 0; d--) {
98         s = 1 << d;
99 
100         for (i = 0; i < 64; i += s * 2) {
101             for (j = i; j < i + s; j++) {
102                 x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s);
103                 y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]);
104 
105                 out[j + 0] = x;
106                 out[j + s] = y;
107             }
108         }
109     }
110 }
111 
112 /* return number of trailing zeros of the non-zero input in */
ctz(uint64_t in)113 static inline int ctz(uint64_t in) {
114     return (int)_tzcnt_u64(in);
115 }
116 
same_mask(uint16_t x,uint16_t y)117 static inline uint64_t same_mask(uint16_t x, uint16_t y) {
118     uint64_t mask;
119 
120     mask = x ^ y;
121     mask -= 1;
122     mask >>= 63;
123     mask = -mask;
124 
125     return mask;
126 }
127 
mov_columns(uint64_t mat[][((SYS_N+255)/256)* 4],uint32_t * perm)128 static int mov_columns(uint64_t mat[][ ((SYS_N + 255) / 256) * 4 ], uint32_t *perm) {
129     int i, j, k, s, block_idx, row, tail;
130     uint64_t buf[64], ctz_list[32], t, d, mask;
131 
132     row = GFBITS * SYS_T - 32;
133     block_idx = row / 64;
134     tail = row % 64;
135 
136     // extract the 32x64 matrix
137 
138     for (i = 0; i < 32; i++) {
139         buf[i] = (mat[ row + i ][ block_idx + 0 ] >> tail) |
140                  (mat[ row + i ][ block_idx + 1 ] << (64 - tail));
141     }
142 
143     // compute the column indices of pivots by Gaussian elimination.
144     // the indices are stored in ctz_list
145 
146     for (i = 0; i < 32; i++) {
147         t = buf[i];
148         for (j = i + 1; j < 32; j++) {
149             t |= buf[j];
150         }
151 
152         if (t == 0) {
153             return -1;    // return if buf is not full rank
154         }
155 
156         ctz_list[i] = s = ctz(t);
157 
158         for (j = i + 1; j < 32; j++) {
159             mask = (buf[i] >> s) & 1;
160             mask -= 1;
161             buf[i] ^= buf[j] & mask;
162         }
163         for (j =   0; j <  i; j++) {
164             mask = (buf[j] >> s) & 1;
165             mask = -mask;
166             buf[j] ^= buf[i] & mask;
167         }
168         for (j = i + 1; j < 32; j++) {
169             mask = (buf[j] >> s) & 1;
170             mask = -mask;
171             buf[j] ^= buf[i] & mask;
172         }
173     }
174 
175     // updating permutation
176 
177     for (j = 0;   j < 32; j++) {
178         for (k = j + 1; k < 64; k++) {
179             d = perm[ row + j ] ^ perm[ row + k ];
180             d &= same_mask(k, ctz_list[j]);
181             perm[ row + j ] ^= d;
182             perm[ row + k ] ^= d;
183         }
184     }
185 
186     // moving columns of mat according to the column indices of pivots
187 
188     for (i = 0; i < GFBITS * SYS_T; i += 64) {
189 
190         for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) {
191             buf[j] = (mat[ i + j ][ block_idx + 0 ] >> tail) |
192                      (mat[ i + j ][ block_idx + 1 ] << (64 - tail));
193         }
194 
195         transpose_64x64(buf, buf);
196 
197         for (j = 0; j < 32; j++) {
198             for (k = j + 1; k < 64; k++) {
199                 d = buf[ j ] ^ buf[ k ];
200                 d &= same_mask(k, ctz_list[j]);
201                 buf[ j ] ^= d;
202                 buf[ k ] ^= d;
203             }
204         }
205 
206         transpose_64x64(buf, buf);
207 
208         for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) {
209             mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << (64 - tail) >> (64 - tail)) | (buf[j] << tail);
210             mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> tail << tail) | (buf[j] >> (64 - tail));
211         }
212     }
213 
214     return 0;
215 }
216 
217 #define NBLOCKS1_H ((SYS_N + 63) / 64)
218 #define NBLOCKS2_H ((SYS_N + 255) / 256)
219 #define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64)
PQCLEAN_MCELIECE6960119F_AVX_pk_gen(unsigned char * pk,uint32_t * perm,const unsigned char * sk)220 int PQCLEAN_MCELIECE6960119F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) {
221     const int block_idx = NBLOCKS1_I - 1;
222     int tail = (GFBITS * SYS_T) % 64;
223 
224     int i, j, k;
225     int row, c;
226 
227     uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ];
228 
229     uint64_t mask;
230 
231     vec128 sk_int[ GFBITS ];
232 
233     vec256 consts[ 32 ][ GFBITS ];
234     vec256 eval[ 32 ][ GFBITS ];
235     vec256 prod[ 32 ][ GFBITS ];
236     vec256 tmp[ GFBITS ];
237 
238     uint64_t list[1 << GFBITS];
239     uint64_t one_row[ NBLOCKS2_H * 4 ];
240 
241     // compute the inverses
242 
243     PQCLEAN_MCELIECE6960119F_AVX_irr_load(sk_int, sk);
244 
245     PQCLEAN_MCELIECE6960119F_AVX_fft(eval, sk_int);
246 
247     PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(prod[0], eval[0]);
248 
249     for (i = 1; i < 32; i++) {
250         PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]);
251     }
252 
253     PQCLEAN_MCELIECE6960119F_AVX_vec256_inv(tmp, prod[31]);
254 
255     for (i = 30; i >= 0; i--) {
256         PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[i + 1], prod[i], tmp);
257         PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(tmp, tmp, eval[i + 1]);
258     }
259 
260     PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(prod[0], tmp);
261 
262     // fill matrix
263 
264     de_bitslicing(list, prod);
265 
266     for (i = 0; i < (1 << GFBITS); i++) {
267         list[i] <<= GFBITS;
268         list[i] |= i;
269         list[i] |= ((uint64_t) perm[i]) << 31;
270     }
271 
272     PQCLEAN_MCELIECE6960119F_AVX_sort_63b(1 << GFBITS, list);
273 
274     to_bitslicing_2x(consts, prod, list);
275 
276     for (i = 0; i < (1 << GFBITS); i++) {
277         perm[i] = list[i] & GFMASK;
278     }
279 
280     for (j = 0; j < NBLOCKS2_H; j++) {
281         for (k = 0; k < GFBITS; k++) {
282             mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 0);
283             mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 1);
284             mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 2);
285             mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 3);
286         }
287     }
288 
289     for (i = 1; i < SYS_T; i++) {
290         for (j = 0; j < NBLOCKS2_H; j++) {
291             PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[j], prod[j], consts[j]);
292 
293             for (k = 0; k < GFBITS; k++) {
294                 mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 0);
295                 mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 1);
296                 mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 2);
297                 mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 3);
298             }
299         }
300     }
301 
302     // gaussian elimination
303 
304     for (row = 0; row < PK_NROWS; row++) {
305         i = row >> 6;
306         j = row & 63;
307 
308         if (row == GFBITS * SYS_T - 32) {
309             if (mov_columns(mat, perm)) {
310                 return -1;
311             }
312         }
313 
314         for (k = row + 1; k < PK_NROWS; k++) {
315             mask = mat[ row ][ i ] >> j;
316             mask &= 1;
317             mask -= 1;
318 
319             for (c = 0; c < NBLOCKS1_H; c++) {
320                 mat[ row ][ c ] ^= mat[ k ][ c ] & mask;
321             }
322         }
323 
324         if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic
325             return -1;
326         }
327 
328         for (k = 0; k < row; k++) {
329             mask = mat[ k ][ i ] >> j;
330             mask &= 1;
331             mask = -mask;
332 
333             for (c = 0; c < NBLOCKS1_H; c++) {
334                 mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
335             }
336         }
337 
338         for (k = row + 1; k < PK_NROWS; k++) {
339             mask = mat[ k ][ i ] >> j;
340             mask &= 1;
341             mask = -mask;
342 
343             for (c = 0; c < NBLOCKS1_H; c++) {
344                 mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
345             }
346         }
347     }
348 
349     for (row = 0; row < PK_NROWS; row++) {
350         for (k = block_idx; k < NBLOCKS1_H; k++) {
351             one_row[k] = mat[ row ][k];
352         }
353 
354         for (k = block_idx; k < NBLOCKS1_H - 1; k++) {
355             one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail));
356             PQCLEAN_MCELIECE6960119F_AVX_store8(pk, one_row[k]);
357             pk += 8;
358         }
359 
360         one_row[k] >>= tail;
361         PQCLEAN_MCELIECE6960119F_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8);
362 
363         pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits
364 
365         pk += PK_ROW_BYTES % 8;
366     }
367 
368     //
369 
370     return 0;
371 }
372 
373