1 #include <stdint.h>
2 #include <immintrin.h>
3 #include <string.h>
4 #include "align.h"
5 #include "fips202x4.h"
6 #include "params.h"
7 #include "poly.h"
8 #include "ntt.h"
9 #include "consts.h"
10 #include "reduce.h"
11 #include "cbd.h"
12 #include "symmetric.h"
13
14 /*************************************************
15 * Name: poly_compress
16 *
17 * Description: Compression and subsequent serialization of a polynomial.
18 * The coefficients of the input polynomial are assumed to
*              lie in the interval [0,q], i.e. the polynomial must be reduced
20 * by poly_reduce().
21 *
22 * Arguments: - uint8_t *r: pointer to output byte array
23 * (of length KYBER_POLYCOMPRESSEDBYTES)
24 * - const poly *a: pointer to input polynomial
25 **************************************************/
26 #if (KYBER_POLYCOMPRESSEDBYTES == 96)
void poly_compress(uint8_t r[96], const poly * restrict a)
{
  unsigned int i;
  __m256i f0, f1, f2, f3;
  __m128i t0, t1;
  /* precomputed scaling constant loaded from qdata (see consts.h) */
  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
  const __m256i shift1 = _mm256_set1_epi16(1 << 8);
  /* keep only the low 3 bits of each rounded coefficient */
  const __m256i mask = _mm256_set1_epi16(7);
  const __m256i shift2 = _mm256_set1_epi16((8 << 8) + 1);
  const __m256i shift3 = _mm256_set1_epi32((64 << 16) + 1);
  const __m256i sllvdidx = _mm256_set1_epi64x(12LL << 32);
  const __m256i shufbidx = _mm256_set_epi8( 8, 2, 1, 0,-1,-1,-1,-1,14,13,12, 6, 5, 4,10, 9,
                                           -1,-1,-1,-1,14,13,12, 6, 5, 4,10, 9, 8, 2, 1, 0);

  /* each iteration compresses 64 coefficients (4 vectors of 16) into 24 bytes */
  for(i=0;i<KYBER_N/64;i++) {
    f0 = _mm256_load_si256(&a->vec[4*i+0]);
    f1 = _mm256_load_si256(&a->vec[4*i+1]);
    f2 = _mm256_load_si256(&a->vec[4*i+2]);
    f3 = _mm256_load_si256(&a->vec[4*i+3]);
    /* scale by v, then round so each coefficient maps to a 3-bit value */
    f0 = _mm256_mulhi_epi16(f0,v);
    f1 = _mm256_mulhi_epi16(f1,v);
    f2 = _mm256_mulhi_epi16(f2,v);
    f3 = _mm256_mulhi_epi16(f3,v);
    f0 = _mm256_mulhrs_epi16(f0,shift1);
    f1 = _mm256_mulhrs_epi16(f1,shift1);
    f2 = _mm256_mulhrs_epi16(f2,shift1);
    f3 = _mm256_mulhrs_epi16(f3,shift1);
    f0 = _mm256_and_si256(f0,mask);
    f1 = _mm256_and_si256(f1,mask);
    f2 = _mm256_and_si256(f2,mask);
    f3 = _mm256_and_si256(f3,mask);
    /* narrow 16-bit values to bytes, then pack the 3-bit fields together */
    f0 = _mm256_packus_epi16(f0,f1);
    f2 = _mm256_packus_epi16(f2,f3);
    f0 = _mm256_maddubs_epi16(f0,shift2);   // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
    f2 = _mm256_maddubs_epi16(f2,shift2);   // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
    f0 = _mm256_madd_epi16(f0,shift3);      // a0 a1 b0 b1 a2 a3 b2 b3
    f2 = _mm256_madd_epi16(f2,shift3);      // c0 c1 d0 d1 c2 c3 d2 d3
    f0 = _mm256_sllv_epi32(f0,sllvdidx);
    f2 = _mm256_sllv_epi32(f2,sllvdidx);
    f0 = _mm256_hadd_epi32(f0,f2);          // a0 b0 c0 d0 a1 b1 c1 d1
    f0 = _mm256_permute4x64_epi64(f0,0xD8); // a0 b0 a1 b1 c0 d0 c1 d1
    f0 = _mm256_shuffle_epi8(f0,shufbidx);
    /* write 16 + 8 bytes; only 24 of them carry compressed data */
    t0 = _mm256_castsi256_si128(f0);
    t1 = _mm256_extracti128_si256(f0,1);
    t0 = _mm_blend_epi32(t0,t1,0x08);
    _mm_storeu_si128((__m128i *)&r[24*i+ 0],t0);
    _mm_storel_epi64((__m128i *)&r[24*i+16],t1);
  }
}
76
77 /*************************************************
78 * Name: poly_decompress
79 *
80 * Description: De-serialization and subsequent decompression of a polynomial;
81 * approximate inverse of poly_compress
82 *
83 * Arguments: - poly *r: pointer to output polynomial
84 * - const uint8_t *a: pointer to input byte array
85 * (of length KYBER_POLYCOMPRESSEDBYTES bytes)
86 **************************************************/
void poly_decompress(poly * restrict r, const uint8_t a[96])
{
  unsigned int i;
  __m128i t;
  __m256i f;
  /* q broadcast for the final rounding multiply */
  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
  /* replicate each input byte into the coefficient slots it contributes to */
  const __m256i shufbidx = _mm256_set_epi8(5,5,5,5,5,4,4,4,4,4,4,3,3,3,3,3,
                                           2,2,2,2,2,1,1,1,1,1,1,0,0,0,0,0);
  /* per-lane masks/shifts isolating one 3-bit field per 16-bit coefficient */
  const __m256i mask = _mm256_set_epi16(224,28,896,112,14,448,56,7,
                                        224,28,896,112,14,448,56,7);
  const __m256i shift = _mm256_set_epi16(128,1024,32,256,2048,64,512,4096,
                                         128,1024,32,256,2048,64,512,4096);

  /* 6 input bytes (4 + 2) yield 16 coefficients per iteration */
  for(i=0;i<KYBER_N/16;i++) {
    /* fix: removed stray extra ')' that broke compilation */
    t = _mm_castps_si128(_mm_load_ss((float *)&a[6*i+0]));
    t = _mm_insert_epi16(t,*(int16_t *)&a[6*i+4],2);
    f = _mm256_broadcastsi128_si256(t);
    /* fix: dropped bogus `_mm256_blend_epi16(f,g,0x)` line referencing
       an undefined variable; it is not part of the algorithm */
    f = _mm256_shuffle_epi8(f,shufbidx);
    f = _mm256_and_si256(f,mask);
    /* move each 3-bit value to the top of its lane, then scale by q
       with rounding to recover an approximation of the coefficient */
    f = _mm256_mullo_epi16(f,shift);
    f = _mm256_mulhrs_epi16(f,q);
    _mm256_store_si256(&r->vec[i],f);
  }
}
112
113 #elif (KYBER_POLYCOMPRESSEDBYTES == 128)
void poly_compress(uint8_t r[128], const poly * restrict a)
{
  unsigned int i;
  __m256i f0, f1, f2, f3;
  /* precomputed scaling constant loaded from qdata (see consts.h) */
  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
  const __m256i shift1 = _mm256_set1_epi16(1 << 9);
  /* keep only the low 4 bits of each rounded coefficient */
  const __m256i mask = _mm256_set1_epi16(15);
  const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);
  const __m256i permdidx = _mm256_set_epi32(7,3,6,2,5,1,4,0);

  /* each iteration compresses 64 coefficients into 32 output bytes */
  for(i=0;i<KYBER_N/64;i++) {
    f0 = _mm256_load_si256(&a->vec[4*i+0]);
    f1 = _mm256_load_si256(&a->vec[4*i+1]);
    f2 = _mm256_load_si256(&a->vec[4*i+2]);
    f3 = _mm256_load_si256(&a->vec[4*i+3]);
    /* scale by v, then round so each coefficient maps to a 4-bit value */
    f0 = _mm256_mulhi_epi16(f0,v);
    f1 = _mm256_mulhi_epi16(f1,v);
    f2 = _mm256_mulhi_epi16(f2,v);
    f3 = _mm256_mulhi_epi16(f3,v);
    f0 = _mm256_mulhrs_epi16(f0,shift1);
    f1 = _mm256_mulhrs_epi16(f1,shift1);
    f2 = _mm256_mulhrs_epi16(f2,shift1);
    f3 = _mm256_mulhrs_epi16(f3,shift1);
    f0 = _mm256_and_si256(f0,mask);
    f1 = _mm256_and_si256(f1,mask);
    f2 = _mm256_and_si256(f2,mask);
    f3 = _mm256_and_si256(f3,mask);
    /* narrow to bytes, then merge pairs of 4-bit values into single bytes */
    f0 = _mm256_packus_epi16(f0,f1);
    f2 = _mm256_packus_epi16(f2,f3);
    f0 = _mm256_maddubs_epi16(f0,shift2);
    f2 = _mm256_maddubs_epi16(f2,shift2);
    f0 = _mm256_packus_epi16(f0,f2);
    /* undo the 128-bit-lane interleaving introduced by the pack steps */
    f0 = _mm256_permutevar8x32_epi32(f0,permdidx);
    _mm256_storeu_si256((__m256i *)&r[32*i],f0);
  }
}
150
void poly_decompress(poly * restrict r, const uint8_t a[128])
{
  unsigned int i;
  __m128i t;
  __m256i f;
  /* q broadcast for the final rounding multiply */
  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
  /* replicate each input byte into four consecutive coefficient slots */
  const __m256i shufbidx = _mm256_set_epi8(7,7,7,7,6,6,6,6,5,5,5,5,4,4,4,4,
                                           3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0);
  /* alternately select the low or high nibble of each replicated byte */
  const __m256i mask = _mm256_set1_epi32(0x00F0000F);
  const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048);

  /* 8 input bytes yield 16 coefficients per iteration */
  for(i=0;i<KYBER_N/16;i++) {
    t = _mm_loadl_epi64((__m128i *)&a[8*i]);
    f = _mm256_broadcastsi128_si256(t);
    f = _mm256_shuffle_epi8(f,shufbidx);
    f = _mm256_and_si256(f,mask);
    /* move each 4-bit value to the top of its lane, then scale by q
       with rounding to recover an approximation of the coefficient */
    f = _mm256_mullo_epi16(f,shift);
    f = _mm256_mulhrs_epi16(f,q);
    _mm256_store_si256(&r->vec[i],f);
  }
}
172
173 #elif (KYBER_POLYCOMPRESSEDBYTES == 160)
void poly_compress(uint8_t r[160], const poly * restrict a)
{
  unsigned int i;
  __m256i f0, f1;
  __m128i t0, t1;
  /* precomputed scaling constant loaded from qdata (see consts.h) */
  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
  const __m256i shift1 = _mm256_set1_epi16(1 << 10);
  /* keep only the low 5 bits of each rounded coefficient */
  const __m256i mask = _mm256_set1_epi16(31);
  const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
  const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1);
  const __m256i sllvdidx = _mm256_set1_epi64x(12);
  const __m256i shufbidx = _mm256_set_epi8( 8,-1,-1,-1,-1,-1, 4, 3, 2, 1, 0,-1,12,11,10, 9,
                                           -1,12,11,10, 9, 8,-1,-1,-1,-1,-1 ,4, 3, 2, 1, 0);

  /* each iteration compresses 32 coefficients (2 vectors of 16) into 20 bytes */
  for(i=0;i<KYBER_N/32;i++) {
    f0 = _mm256_load_si256(&a->vec[2*i+0]);
    f1 = _mm256_load_si256(&a->vec[2*i+1]);
    /* scale by v, then round so each coefficient maps to a 5-bit value */
    f0 = _mm256_mulhi_epi16(f0,v);
    f1 = _mm256_mulhi_epi16(f1,v);
    f0 = _mm256_mulhrs_epi16(f0,shift1);
    f1 = _mm256_mulhrs_epi16(f1,shift1);
    f0 = _mm256_and_si256(f0,mask);
    f1 = _mm256_and_si256(f1,mask);
    /* pack the 5-bit fields into a contiguous bitstream */
    f0 = _mm256_packus_epi16(f0,f1);
    f0 = _mm256_maddubs_epi16(f0,shift2); // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
    f0 = _mm256_madd_epi16(f0,shift3);    // a0 a1 b0 b1 a2 a3 b2 b3
    f0 = _mm256_sllv_epi32(f0,sllvdidx);
    f0 = _mm256_srlv_epi64(f0,sllvdidx);
    f0 = _mm256_shuffle_epi8(f0,shufbidx);
    t0 = _mm256_castsi256_si128(f0);
    t1 = _mm256_extracti128_si256(f0,1);
    /* shufbidx doubles as the blend mask: its -1 (0xFF) bytes select from t1 */
    t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx));
    _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
    /* final 4 bytes via memcpy to avoid an unaligned vector store */
    memcpy(&r[20*i+16],&t1,4);
  }
}
210
void poly_decompress(poly * restrict r, const uint8_t a[160])
{
  unsigned int i;
  __m128i t;
  __m256i f;
  int16_t ti;
  /* q broadcast for the final rounding multiply */
  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
  /* replicate each input byte into the coefficient slots it contributes to */
  const __m256i shufbidx = _mm256_set_epi8(9,9,9,8,8,8,8,7,7,6,6,6,6,5,5,5,
                                           4,4,4,3,3,3,3,2,2,1,1,1,1,0,0,0);
  /* per-lane masks/shifts isolating one 5-bit field per 16-bit coefficient */
  const __m256i mask = _mm256_set_epi16(248,1984,62,496,3968,124,992,31,
                                        248,1984,62,496,3968,124,992,31);
  const __m256i shift = _mm256_set_epi16(128,16,512,64,8,256,32,1024,
                                         128,16,512,64,8,256,32,1024);

  /* 10 input bytes (8 + 2) yield 16 coefficients per iteration */
  for(i=0;i<KYBER_N/16;i++) {
    t = _mm_loadl_epi64((__m128i *)&a[10*i+0]);
    memcpy(&ti,&a[10*i+8],2);  /* unaligned-safe load of the last 2 bytes */
    t = _mm_insert_epi16(t,ti,4);
    f = _mm256_broadcastsi128_si256(t);
    f = _mm256_shuffle_epi8(f,shufbidx);
    f = _mm256_and_si256(f,mask);
    /* move each 5-bit value to the top of its lane, then scale by q
       with rounding to recover an approximation of the coefficient */
    f = _mm256_mullo_epi16(f,shift);
    f = _mm256_mulhrs_epi16(f,q);
    _mm256_store_si256(&r->vec[i],f);
  }
}
237
238 #endif
239
240 /*************************************************
241 * Name: poly_tobytes
242 *
243 * Description: Serialization of a polynomial in NTT representation.
244 * The coefficients of the input polynomial are assumed to
*              lie in the interval [0,q], i.e. the polynomial must be reduced
*              by poly_reduce(). The coefficients are ordered as output by
247 * poly_ntt(); the serialized output coefficients are in bitreversed
248 * order.
249 *
250 * Arguments: - uint8_t *r: pointer to output byte array
251 * (needs space for KYBER_POLYBYTES bytes)
252 * - poly *a: pointer to input polynomial
253 **************************************************/
void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
{
  /* Delegates to the vectorized serializer; qdata supplies precomputed constants */
  ntttobytes_avx(r, a->vec, qdata.vec);
}
258
259 /*************************************************
260 * Name: poly_frombytes
261 *
262 * Description: De-serialization of a polynomial;
263 * inverse of poly_tobytes
264 *
265 * Arguments: - poly *r: pointer to output polynomial
266 * - const uint8_t *a: pointer to input byte array
267 * (of KYBER_POLYBYTES bytes)
268 **************************************************/
void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
{
  /* Delegates to the vectorized deserializer; qdata supplies precomputed constants */
  nttfrombytes_avx(r->vec, a, qdata.vec);
}
273
274 /*************************************************
275 * Name: poly_frommsg
276 *
277 * Description: Convert 32-byte message to polynomial
278 *
279 * Arguments: - poly *r: pointer to output polynomial
280 * - const uint8_t *msg: pointer to input message
281 **************************************************/
void poly_frommsg(poly * restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
{
#if (KYBER_INDCPA_MSGBYTES != 32)
#error "KYBER_INDCPA_MSGBYTES must be equal to 32!"
#endif
  __m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
  const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0,1,2,3));
  const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0));
  /* each message bit maps to either 0 or (q+1)/2 */
  const __m256i hqs = _mm256_set1_epi16((KYBER_Q+1)/2);

/* Expand 64 message bits into 64 coefficients: the shifts move each bit to
   the sign position of a 16-bit lane, the arithmetic right shifts smear it
   into a full-width mask, and ANDing with hqs selects (q+1)/2 or 0.
   The unpack/permute steps restore the coefficient ordering expected
   in r->vec. */
#define FROMMSG64(i)                          \
  g3 = _mm256_shuffle_epi32(f,0x55*i);        \
  g3 = _mm256_sllv_epi32(g3,shift);           \
  g3 = _mm256_shuffle_epi8(g3,idx);           \
  g0 = _mm256_slli_epi16(g3,12);              \
  g1 = _mm256_slli_epi16(g3,8);               \
  g2 = _mm256_slli_epi16(g3,4);               \
  g0 = _mm256_srai_epi16(g0,15);              \
  g1 = _mm256_srai_epi16(g1,15);              \
  g2 = _mm256_srai_epi16(g2,15);              \
  g3 = _mm256_srai_epi16(g3,15);              \
  g0 = _mm256_and_si256(g0,hqs);  /* 19 18 17 16  3  2  1  0 */ \
  g1 = _mm256_and_si256(g1,hqs);  /* 23 22 21 20  7  6  5  4 */ \
  g2 = _mm256_and_si256(g2,hqs);  /* 27 26 25 24 11 10  9  8 */ \
  g3 = _mm256_and_si256(g3,hqs);  /* 31 30 29 28 15 14 13 12 */ \
  h0 = _mm256_unpacklo_epi64(g0,g1);          \
  h2 = _mm256_unpackhi_epi64(g0,g1);          \
  h1 = _mm256_unpacklo_epi64(g2,g3);          \
  h3 = _mm256_unpackhi_epi64(g2,g3);          \
  g0 = _mm256_permute2x128_si256(h0,h1,0x20); \
  g2 = _mm256_permute2x128_si256(h0,h1,0x31); \
  g1 = _mm256_permute2x128_si256(h2,h3,0x20); \
  g3 = _mm256_permute2x128_si256(h2,h3,0x31); \
  _mm256_store_si256(&r->vec[0+2*i+0],g0);    \
  _mm256_store_si256(&r->vec[0+2*i+1],g1);    \
  _mm256_store_si256(&r->vec[8+2*i+0],g2);    \
  _mm256_store_si256(&r->vec[8+2*i+1],g3)

  /* load all 256 message bits at once, then expand 64 bits per macro call */
  f = _mm256_loadu_si256((__m256i *)msg);
  FROMMSG64(0);
  FROMMSG64(1);
  FROMMSG64(2);
  FROMMSG64(3);
}
326
327 /*************************************************
328 * Name: poly_tomsg
329 *
330 * Description: Convert polynomial to 32-byte message.
331 * The coefficients of the input polynomial are assumed to
*              lie in the interval [0,q], i.e. the polynomial must be reduced
333 * by poly_reduce().
334 *
335 * Arguments: - uint8_t *msg: pointer to output message
336 * - poly *a: pointer to input polynomial
337 **************************************************/
void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly * restrict a)
{
  unsigned int i;
  uint32_t small;
  __m256i f0, f1, g0, g1;
  const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1)/2);
  const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1)/4);

  /* 32 coefficients (2 vectors of 16) produce 4 message bytes per iteration */
  for(i=0;i<KYBER_N/32;i++) {
    f0 = _mm256_load_si256(&a->vec[2*i+0]);
    f1 = _mm256_load_si256(&a->vec[2*i+1]);
    /* compute |hq - coeff| branchlessly: sign mask via arithmetic shift,
       then conditional negation with XOR */
    f0 = _mm256_sub_epi16(hq, f0);
    f1 = _mm256_sub_epi16(hq, f1);
    g0 = _mm256_srai_epi16(f0, 15);
    g1 = _mm256_srai_epi16(f1, 15);
    f0 = _mm256_xor_si256(f0, g0);
    f1 = _mm256_xor_si256(f1, g1);
    /* subtract hhq so the message bit ends up in the sign bit */
    f0 = _mm256_sub_epi16(f0, hhq);
    f1 = _mm256_sub_epi16(f1, hhq);
    /* narrow to bytes, fix lane order, then harvest the 32 sign bits */
    f0 = _mm256_packs_epi16(f0, f1);
    f0 = _mm256_permute4x64_epi64(f0, 0xD8);
    small = _mm256_movemask_epi8(f0);
    memcpy(&msg[4*i], &small, 4);
  }
}
363
364 /*************************************************
365 * Name: poly_getnoise_eta1
366 *
367 * Description: Sample a polynomial deterministically from a seed and a nonce,
368 * with output polynomial close to centered binomial distribution
369 * with parameter KYBER_ETA1
370 *
371 * Arguments: - poly *r: pointer to output polynomial
372 * - const uint8_t *seed: pointer to input seed
373 * (of length KYBER_SYMBYTES bytes)
374 * - uint8_t nonce: one-byte input nonce
375 **************************************************/
void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
{
  ALIGNED_UINT8(KYBER_ETA1*KYBER_N/4+32) buf; // +32 bytes as required by poly_cbd_eta1
  /* expand (seed, nonce) into the randomness consumed by the CBD sampler */
  prf(buf.coeffs, KYBER_ETA1*KYBER_N/4, seed, nonce);
  poly_cbd_eta1(r, buf.vec);
}
382
383 /*************************************************
384 * Name: poly_getnoise_eta2
385 *
386 * Description: Sample a polynomial deterministically from a seed and a nonce,
387 * with output polynomial close to centered binomial distribution
388 * with parameter KYBER_ETA2
389 *
390 * Arguments: - poly *r: pointer to output polynomial
391 * - const uint8_t *seed: pointer to input seed
392 * (of length KYBER_SYMBYTES bytes)
393 * - uint8_t nonce: one-byte input nonce
394 **************************************************/
void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
{
  /* no +32 padding here, unlike poly_getnoise_eta1;
     NOTE(review): assumes poly_cbd_eta2 reads no bytes past the buffer — confirm in cbd.c */
  ALIGNED_UINT8(KYBER_ETA2*KYBER_N/4) buf;
  /* expand (seed, nonce) into the randomness consumed by the CBD sampler */
  prf(buf.coeffs, KYBER_ETA2*KYBER_N/4, seed, nonce);
  poly_cbd_eta2(r, buf.vec);
}
401
402 #ifndef KYBER_90S
403 #define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE)
/* Sample four CBD(eta1) noise polynomials at once using 4-way SHAKE256:
 * each lane absorbs seed || nonce_k (33 bytes) and squeezes NOISE_NBLOCKS
 * blocks of randomness, which are then converted by poly_cbd_eta1. */
void poly_getnoise_eta1_4x(poly *r0,
                           poly *r1,
                           poly *r2,
                           poly *r3,
                           const uint8_t seed[32],
                           uint8_t nonce0,
                           uint8_t nonce1,
                           uint8_t nonce2,
                           uint8_t nonce3)
{
  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
  __m256i f;
  shake256x4incctx state;

  /* copy the 32-byte seed into each lane's buffer */
  f = _mm256_loadu_si256((__m256i *)seed);
  _mm256_store_si256(buf[0].vec, f);
  _mm256_store_si256(buf[1].vec, f);
  _mm256_store_si256(buf[2].vec, f);
  _mm256_store_si256(buf[3].vec, f);

  /* append the per-lane domain-separating nonce */
  buf[0].coeffs[32] = nonce0;
  buf[1].coeffs[32] = nonce1;
  buf[2].coeffs[32] = nonce2;
  buf[3].coeffs[32] = nonce3;

  shake256x4_inc_init(&state);
  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
  shake256x4_inc_ctx_release(&state);

  /* convert the squeezed randomness into CBD(eta1) polynomials */
  poly_cbd_eta1(r0, buf[0].vec);
  poly_cbd_eta1(r1, buf[1].vec);
  poly_cbd_eta1(r2, buf[2].vec);
  poly_cbd_eta1(r3, buf[3].vec);
}
439
440 #if KYBER_K == 2
/* Like poly_getnoise_eta1_4x, but r0/r1 are sampled from CBD(eta1) while
 * r2/r3 use CBD(eta2) — the parameter combination needed when KYBER_K == 2. */
void poly_getnoise_eta1122_4x(poly *r0,
                              poly *r1,
                              poly *r2,
                              poly *r3,
                              const uint8_t seed[32],
                              uint8_t nonce0,
                              uint8_t nonce1,
                              uint8_t nonce2,
                              uint8_t nonce3)
{
  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
  __m256i f;
  shake256x4incctx state;

  /* copy the 32-byte seed into each lane's buffer */
  f = _mm256_loadu_si256((__m256i *)seed);
  _mm256_store_si256(buf[0].vec, f);
  _mm256_store_si256(buf[1].vec, f);
  _mm256_store_si256(buf[2].vec, f);
  _mm256_store_si256(buf[3].vec, f);

  /* append the per-lane domain-separating nonce */
  buf[0].coeffs[32] = nonce0;
  buf[1].coeffs[32] = nonce1;
  buf[2].coeffs[32] = nonce2;
  buf[3].coeffs[32] = nonce3;

  shake256x4_inc_init(&state);
  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
  shake256x4_inc_ctx_release(&state);

  /* eta1 for the first two outputs, eta2 for the last two */
  poly_cbd_eta1(r0, buf[0].vec);
  poly_cbd_eta1(r1, buf[1].vec);
  poly_cbd_eta2(r2, buf[2].vec);
  poly_cbd_eta2(r3, buf[3].vec);
}
476 #endif
477 #endif
478
479 /*************************************************
480 * Name: poly_ntt
481 *
482 * Description: Computes negacyclic number-theoretic transform (NTT) of
483 * a polynomial in place.
484 * Input coefficients assumed to be in normal order,
485 * output coefficients are in special order that is natural
486 * for the vectorization. Input coefficients are assumed to be
487 * bounded by q in absolute value, output coefficients are bounded
488 * by 16118 in absolute value.
489 *
490 * Arguments: - poly *r: pointer to in/output polynomial
491 **************************************************/
void poly_ntt(poly *r)
{
  /* Delegates to the vectorized forward NTT; qdata supplies twiddle constants */
  ntt_avx(r->vec, qdata.vec);
}
496
497 /*************************************************
498 * Name: poly_invntt_tomont
499 *
500 * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
501 * of a polynomial in place;
502 * Input coefficients assumed to be in special order from vectorized
503 * forward ntt, output in normal order. Input coefficients can be
504 * arbitrary 16-bit integers, output coefficients are bounded by 14870
505 * in absolute value.
506 *
507 * Arguments: - poly *a: pointer to in/output polynomial
508 **************************************************/
void poly_invntt_tomont(poly *r)
{
  /* Delegates to the vectorized inverse NTT; qdata supplies twiddle constants */
  invntt_avx(r->vec, qdata.vec);
}
513
/* Reorder coefficients from the vectorization-friendly NTT layout;
 * delegates to the vectorized routine. */
void poly_nttunpack(poly *r)
{
  nttunpack_avx(r->vec, qdata.vec);
}
518
519 /*************************************************
520 * Name: poly_basemul_montgomery
521 *
522 * Description: Multiplication of two polynomials in NTT domain.
523 * One of the input polynomials needs to have coefficients
524 * bounded by q, the other polynomial can have arbitrary
525 * coefficients. Output coefficients are bounded by 6656.
526 *
527 * Arguments: - poly *r: pointer to output polynomial
528 * - const poly *a: pointer to first input polynomial
529 * - const poly *b: pointer to second input polynomial
530 **************************************************/
void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
{
  /* Delegates pointwise NTT-domain multiplication to the vectorized routine */
  basemul_avx(r->vec, a->vec, b->vec, qdata.vec);
}
535
536 /*************************************************
537 * Name: poly_tomont
538 *
539 * Description: Inplace conversion of all coefficients of a polynomial
540 * from normal domain to Montgomery domain
541 *
542 * Arguments: - poly *r: pointer to input/output polynomial
543 **************************************************/
void poly_tomont(poly *r)
{
  /* Delegates the Montgomery-domain conversion to the vectorized routine */
  tomont_avx(r->vec, qdata.vec);
}
548
549 /*************************************************
550 * Name: poly_reduce
551 *
552 * Description: Applies Barrett reduction to all coefficients of a polynomial
553 * for details of the Barrett reduction see comments in reduce.c
554 *
555 * Arguments: - poly *r: pointer to input/output polynomial
556 **************************************************/
void poly_reduce(poly *r)
{
  /* Delegates the Barrett reduction to the vectorized routine */
  reduce_avx(r->vec, qdata.vec);
}
561
562 /*************************************************
563 * Name: poly_add
564 *
565 * Description: Add two polynomials. No modular reduction
566 * is performed.
567 *
568 * Arguments: - poly *r: pointer to output polynomial
569 * - const poly *a: pointer to first input polynomial
570 * - const poly *b: pointer to second input polynomial
571 **************************************************/
void poly_add(poly *r, const poly *a, const poly *b)
{
  /* Coefficient-wise addition, 16 coefficients per vector;
     no modular reduction is performed. */
  unsigned int idx;

  for(idx = 0; idx < KYBER_N/16; idx++) {
    const __m256i lhs = _mm256_load_si256(&a->vec[idx]);
    const __m256i rhs = _mm256_load_si256(&b->vec[idx]);
    _mm256_store_si256(&r->vec[idx], _mm256_add_epi16(lhs, rhs));
  }
}
584
585 /*************************************************
586 * Name: poly_sub
587 *
588 * Description: Subtract two polynomials. No modular reduction
589 * is performed.
590 *
591 * Arguments: - poly *r: pointer to output polynomial
592 * - const poly *a: pointer to first input polynomial
593 * - const poly *b: pointer to second input polynomial
594 **************************************************/
void poly_sub(poly *r, const poly *a, const poly *b)
{
  /* Coefficient-wise subtraction (a - b), 16 coefficients per vector;
     no modular reduction is performed. */
  unsigned int idx;

  for(idx = 0; idx < KYBER_N/16; idx++) {
    const __m256i lhs = _mm256_load_si256(&a->vec[idx]);
    const __m256i rhs = _mm256_load_si256(&b->vec[idx]);
    _mm256_store_si256(&r->vec[idx], _mm256_sub_epi16(lhs, rhs));
  }
}
607