1 #include <stdint.h>
2 #include <immintrin.h>
3 #include <string.h>
4 #include "align.h"
5 #include "fips202x4.h"
6 #include "params.h"
7 #include "poly.h"
8 #include "ntt.h"
9 #include "consts.h"
10 #include "reduce.h"
11 #include "cbd.h"
12 #include "symmetric.h"
13 
14 /*************************************************
15 * Name:        poly_compress
16 *
17 * Description: Compression and subsequent serialization of a polynomial.
18 *              The coefficients of the input polynomial are assumed to
19 *              lie in the interval [0,q], i.e. the polynomial must be reduced
20 *              by poly_reduce().
21 *
22 * Arguments:   - uint8_t *r: pointer to output byte array
23 *                            (of length KYBER_POLYCOMPRESSEDBYTES)
24 *              - const poly *a: pointer to input polynomial
25 **************************************************/
26 #if (KYBER_POLYCOMPRESSEDBYTES == 96)
void poly_compress(uint8_t r[96], const poly * restrict a)
{
  unsigned int i;
  __m256i f0, f1, f2, f3;
  __m128i t0, t1;
  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);  /* precomputed constant for division by q */
  const __m256i shift1 = _mm256_set1_epi16(1 << 8);
  const __m256i mask = _mm256_set1_epi16(7);                  /* keep 3-bit compressed values */
  const __m256i shift2 = _mm256_set1_epi16((8 << 8) + 1);     /* merge adjacent bytes: lo + 8*hi */
  const __m256i shift3 = _mm256_set1_epi32((64 << 16) + 1);   /* merge adjacent words: lo + 64*hi */
  const __m256i sllvdidx = _mm256_set1_epi64x(12LL << 32);    /* shift only the upper dword of each qword */
  const __m256i shufbidx = _mm256_set_epi8( 8, 2, 1, 0,-1,-1,-1,-1,14,13,12, 6, 5, 4,10, 9,
                                           -1,-1,-1,-1,14,13,12, 6, 5, 4,10, 9, 8, 2, 1, 0);

  /* Each iteration compresses 64 coefficients (4 vectors) into 24 bytes (64*3 bits). */
  for(i=0;i<KYBER_N/64;i++) {
    f0 = _mm256_load_si256(&a->vec[4*i+0]);
    f1 = _mm256_load_si256(&a->vec[4*i+1]);
    f2 = _mm256_load_si256(&a->vec[4*i+2]);
    f3 = _mm256_load_si256(&a->vec[4*i+3]);
    /* mulhi by v followed by rounding mulhrs computes round(8*x/q); mask reduces mod 8 */
    f0 = _mm256_mulhi_epi16(f0,v);
    f1 = _mm256_mulhi_epi16(f1,v);
    f2 = _mm256_mulhi_epi16(f2,v);
    f3 = _mm256_mulhi_epi16(f3,v);
    f0 = _mm256_mulhrs_epi16(f0,shift1);
    f1 = _mm256_mulhrs_epi16(f1,shift1);
    f2 = _mm256_mulhrs_epi16(f2,shift1);
    f3 = _mm256_mulhrs_epi16(f3,shift1);
    f0 = _mm256_and_si256(f0,mask);
    f1 = _mm256_and_si256(f1,mask);
    f2 = _mm256_and_si256(f2,mask);
    f3 = _mm256_and_si256(f3,mask);
    /* pack to bytes, then successively combine neighbors into packed 3-bit fields */
    f0 = _mm256_packus_epi16(f0,f1);
    f2 = _mm256_packus_epi16(f2,f3);
    f0 = _mm256_maddubs_epi16(f0,shift2);	// a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
    f2 = _mm256_maddubs_epi16(f2,shift2);	// c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
    f0 = _mm256_madd_epi16(f0,shift3);		// a0 a1 b0 b1 a2 a3 b2 b3
    f2 = _mm256_madd_epi16(f2,shift3);		// c0 c1 d0 d1 c2 c3 d2 d3
    f0 = _mm256_sllv_epi32(f0,sllvdidx);
    f2 = _mm256_sllv_epi32(f2,sllvdidx);
    f0 = _mm256_hadd_epi32(f0,f2);		// a0 b0 c0 d0 a1 b1 c1 d1
    f0 = _mm256_permute4x64_epi64(f0,0xD8);	// a0 b0 a1 b1 c0 d0 c1 d1
    f0 = _mm256_shuffle_epi8(f0,shufbidx);
    t0 = _mm256_castsi256_si128(f0);
    t1 = _mm256_extracti128_si256(f0,1);
    t0 = _mm_blend_epi32(t0,t1,0x08);
    /* write the 24 valid output bytes: one 16-byte store plus one 8-byte store */
    _mm_storeu_si128((__m128i *)&r[24*i+ 0],t0);
    _mm_storel_epi64((__m128i *)&r[24*i+16],t1);
  }
}
76 
77 /*************************************************
78 * Name:        poly_decompress
79 *
80 * Description: De-serialization and subsequent decompression of a polynomial;
81 *              approximate inverse of poly_compress
82 *
83 * Arguments:   - poly *r: pointer to output polynomial
84 *              - const uint8_t *a: pointer to input byte array
85 *                                  (of length KYBER_POLYCOMPRESSEDBYTES bytes)
86 **************************************************/
void poly_decompress(poly * restrict r, const uint8_t a[96])
{
  unsigned int i;
  __m128i t;
  __m256i f;
  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
  /* replicate each of the 6 input bytes to every coefficient it contributes bits to */
  const __m256i shufbidx = _mm256_set_epi8(5,5,5,5,5,4,4,4,4,4,4,3,3,3,3,3,
                                           2,2,2,2,2,1,1,1,1,1,1,0,0,0,0,0);
  const __m256i mask = _mm256_set_epi16(224,28,896,112,14,448,56,7,
                                        224,28,896,112,14,448,56,7);
  const __m256i shift = _mm256_set_epi16(128,1024,32,256,2048,64,512,4096,
                                         128,1024,32,256,2048,64,512,4096);

  /* Each iteration expands 6 bytes into 16 coefficients: x -> round(x*q/8). */
  for(i=0;i<KYBER_N/16;i++) {
    /* Fixed: removed stray ')' (syntax error) and a leftover
     * "_mm256_blend_epi16(f,g,0x)" line that referenced an undefined
     * variable g and had a malformed immediate. */
    t = _mm_castps_si128(_mm_load_ss((const float *)&a[6*i+0]));   /* load bytes 0..3 */
    t = _mm_insert_epi16(t,*(const int16_t *)&a[6*i+4],2);         /* load bytes 4..5 */
    f = _mm256_broadcastsi128_si256(t);
    f = _mm256_shuffle_epi8(f,shufbidx);
    f = _mm256_and_si256(f,mask);     /* isolate each coefficient's 3 bits */
    f = _mm256_mullo_epi16(f,shift);  /* align them to bits 14..12 */
    f = _mm256_mulhrs_epi16(f,q);     /* rounding multiply by q/2^15 */
    _mm256_store_si256(&r->vec[i],f);
  }
}
112 
113 #elif (KYBER_POLYCOMPRESSEDBYTES == 128)
void poly_compress(uint8_t r[128], const poly * restrict a)
{
  unsigned int i;
  __m256i f0, f1, f2, f3;
  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);  /* precomputed constant for division by q */
  const __m256i shift1 = _mm256_set1_epi16(1 << 9);
  const __m256i mask = _mm256_set1_epi16(15);                 /* keep 4-bit compressed values */
  const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);    /* merge adjacent bytes: lo + 16*hi */
  const __m256i permdidx = _mm256_set_epi32(7,3,6,2,5,1,4,0); /* undo the lane interleaving of packus */

  /* Each iteration compresses 64 coefficients (4 vectors) into 32 bytes (64*4 bits). */
  for(i=0;i<KYBER_N/64;i++) {
    f0 = _mm256_load_si256(&a->vec[4*i+0]);
    f1 = _mm256_load_si256(&a->vec[4*i+1]);
    f2 = _mm256_load_si256(&a->vec[4*i+2]);
    f3 = _mm256_load_si256(&a->vec[4*i+3]);
    /* mulhi by v followed by rounding mulhrs computes round(16*x/q); mask reduces mod 16 */
    f0 = _mm256_mulhi_epi16(f0,v);
    f1 = _mm256_mulhi_epi16(f1,v);
    f2 = _mm256_mulhi_epi16(f2,v);
    f3 = _mm256_mulhi_epi16(f3,v);
    f0 = _mm256_mulhrs_epi16(f0,shift1);
    f1 = _mm256_mulhrs_epi16(f1,shift1);
    f2 = _mm256_mulhrs_epi16(f2,shift1);
    f3 = _mm256_mulhrs_epi16(f3,shift1);
    f0 = _mm256_and_si256(f0,mask);
    f1 = _mm256_and_si256(f1,mask);
    f2 = _mm256_and_si256(f2,mask);
    f3 = _mm256_and_si256(f3,mask);
    /* pack to bytes and combine byte pairs into single bytes holding two nibbles */
    f0 = _mm256_packus_epi16(f0,f1);
    f2 = _mm256_packus_epi16(f2,f3);
    f0 = _mm256_maddubs_epi16(f0,shift2);
    f2 = _mm256_maddubs_epi16(f2,shift2);
    f0 = _mm256_packus_epi16(f0,f2);
    f0 = _mm256_permutevar8x32_epi32(f0,permdidx);
    _mm256_storeu_si256((__m256i *)&r[32*i],f0);
  }
}
150 
void poly_decompress(poly * restrict r, const uint8_t a[128])
{
  unsigned int i;
  __m128i t;
  __m256i f;
  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
  /* broadcast each of the 8 input bytes to the two coefficients it encodes */
  const __m256i shufbidx = _mm256_set_epi8(7,7,7,7,6,6,6,6,5,5,5,5,4,4,4,4,
                                           3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0);
  const __m256i mask = _mm256_set1_epi32(0x00F0000F);          /* alternately select low/high nibble */
  const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048); /* align each nibble to bits 14..11 */

  /* Each iteration expands 8 bytes into 16 coefficients: x -> round(x*q/16). */
  for(i=0;i<KYBER_N/16;i++) {
    t = _mm_loadl_epi64((__m128i *)&a[8*i]);
    f = _mm256_broadcastsi128_si256(t);
    f = _mm256_shuffle_epi8(f,shufbidx);
    f = _mm256_and_si256(f,mask);
    f = _mm256_mullo_epi16(f,shift);
    f = _mm256_mulhrs_epi16(f,q);     /* rounding multiply by q/2^15 */
    _mm256_store_si256(&r->vec[i],f);
  }
}
172 
173 #elif (KYBER_POLYCOMPRESSEDBYTES == 160)
void poly_compress(uint8_t r[160], const poly * restrict a)
{
  unsigned int i;
  __m256i f0, f1;
  __m128i t0, t1;
  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);  /* precomputed constant for division by q */
  const __m256i shift1 = _mm256_set1_epi16(1 << 10);
  const __m256i mask = _mm256_set1_epi16(31);                 /* keep 5-bit compressed values */
  const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);    /* merge adjacent bytes: lo + 32*hi */
  const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1); /* merge adjacent words: lo + 1024*hi */
  const __m256i sllvdidx = _mm256_set1_epi64x(12);
  const __m256i shufbidx = _mm256_set_epi8( 8,-1,-1,-1,-1,-1, 4, 3, 2, 1, 0,-1,12,11,10, 9,
                                           -1,12,11,10, 9, 8,-1,-1,-1,-1,-1 ,4, 3, 2, 1, 0);

  /* Each iteration compresses 32 coefficients (2 vectors) into 20 bytes (32*5 bits). */
  for(i=0;i<KYBER_N/32;i++) {
    f0 = _mm256_load_si256(&a->vec[2*i+0]);
    f1 = _mm256_load_si256(&a->vec[2*i+1]);
    /* mulhi by v followed by rounding mulhrs computes round(32*x/q); mask reduces mod 32 */
    f0 = _mm256_mulhi_epi16(f0,v);
    f1 = _mm256_mulhi_epi16(f1,v);
    f0 = _mm256_mulhrs_epi16(f0,shift1);
    f1 = _mm256_mulhrs_epi16(f1,shift1);
    f0 = _mm256_and_si256(f0,mask);
    f1 = _mm256_and_si256(f1,mask);
    /* pack to bytes, then successively combine neighbors into packed 5-bit fields */
    f0 = _mm256_packus_epi16(f0,f1);
    f0 = _mm256_maddubs_epi16(f0,shift2);	// a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
    f0 = _mm256_madd_epi16(f0,shift3);		// a0 a1 b0 b1 a2 a3 b2 b3
    f0 = _mm256_sllv_epi32(f0,sllvdidx);
    f0 = _mm256_srlv_epi64(f0,sllvdidx);
    f0 = _mm256_shuffle_epi8(f0,shufbidx);
    t0 = _mm256_castsi256_si128(f0);
    t1 = _mm256_extracti128_si256(f0,1);
    /* shufbidx doubles as the blend mask: its -1 bytes (MSB set) select from t1 */
    t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx));
    _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
    memcpy(&r[20*i+16],&t1,4);   /* the remaining 4 of the 20 output bytes */
  }
}
210 
void poly_decompress(poly * restrict r, const uint8_t a[160])
{
  unsigned int i;
  __m128i t;
  __m256i f;
  int16_t ti;
  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
  /* replicate each of the 10 input bytes to every coefficient it contributes bits to */
  const __m256i shufbidx = _mm256_set_epi8(9,9,9,8,8,8,8,7,7,6,6,6,6,5,5,5,
                                           4,4,4,3,3,3,3,2,2,1,1,1,1,0,0,0);
  const __m256i mask = _mm256_set_epi16(248,1984,62,496,3968,124,992,31,
                                        248,1984,62,496,3968,124,992,31);
  const __m256i shift = _mm256_set_epi16(128,16,512,64,8,256,32,1024,
                                         128,16,512,64,8,256,32,1024);

  /* Each iteration expands 10 bytes into 16 coefficients: x -> round(x*q/32). */
  for(i=0;i<KYBER_N/16;i++) {
    t = _mm_loadl_epi64((__m128i *)&a[10*i+0]);
    memcpy(&ti,&a[10*i+8],2);          /* unaligned-safe load of the last 2 bytes */
    t = _mm_insert_epi16(t,ti,4);
    f = _mm256_broadcastsi128_si256(t);
    f = _mm256_shuffle_epi8(f,shufbidx);
    f = _mm256_and_si256(f,mask);      /* isolate each coefficient's 5 bits */
    f = _mm256_mullo_epi16(f,shift);   /* align them to bits 14..10 */
    f = _mm256_mulhrs_epi16(f,q);      /* rounding multiply by q/2^15 */
    _mm256_store_si256(&r->vec[i],f);
  }
}
237 
238 #endif
239 
240 /*************************************************
241 * Name:        poly_tobytes
242 *
243 * Description: Serialization of a polynomial in NTT representation.
244 *              The coefficients of the input polynomial are assumed to
245 *              lie in the interval [0,q], i.e. the polynomial must be reduced
246 *              by poly_reduce(). The coefficients are ordered as output by
247 *              poly_ntt(); the serialized output coefficients are in bitreversed
248 *              order.
249 *
250 * Arguments:   - uint8_t *r: pointer to output byte array
251 *                            (needs space for KYBER_POLYBYTES bytes)
252 *              - poly *a: pointer to input polynomial
253 **************************************************/
void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a)
{
  /* delegate to the vectorized packing kernel */
  ntttobytes_avx(r, a->vec, qdata.vec);
}
258 
259 /*************************************************
260 * Name:        poly_frombytes
261 *
262 * Description: De-serialization of a polynomial;
263 *              inverse of poly_tobytes
264 *
265 * Arguments:   - poly *r: pointer to output polynomial
266 *              - const uint8_t *a: pointer to input byte array
267 *                                  (of KYBER_POLYBYTES bytes)
268 **************************************************/
void poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES])
{
  /* delegate to the vectorized unpacking kernel */
  nttfrombytes_avx(r->vec, a, qdata.vec);
}
273 
274 /*************************************************
275 * Name:        poly_frommsg
276 *
277 * Description: Convert 32-byte message to polynomial
278 *
279 * Arguments:   - poly *r: pointer to output polynomial
280 *              - const uint8_t *msg: pointer to input message
281 **************************************************/
void poly_frommsg(poly * restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
{
#if (KYBER_INDCPA_MSGBYTES != 32)
#error "KYBER_INDCPA_MSGBYTES must be equal to 32!"
#endif
  __m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
  const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0,1,2,3));
  const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0));
  const __m256i hqs = _mm256_set1_epi16((KYBER_Q+1)/2);  /* ceil(q/2): coefficient value of a set bit */

/* Expands 64 message bits (one 64-bit quarter of f, selected by i) into 64
 * coefficients: each bit is shifted into the sign position of a 16-bit lane,
 * arithmetic right-shift by 15 turns it into an all-0/all-1 mask, and ANDing
 * with hqs maps bit b to b*(q+1)/2. The unpack/permute steps put the
 * coefficients into the order used by the vectorized representation
 * (see the lane comments below). */
#define FROMMSG64(i)						\
  g3 = _mm256_shuffle_epi32(f,0x55*i);				\
  g3 = _mm256_sllv_epi32(g3,shift);				\
  g3 = _mm256_shuffle_epi8(g3,idx);				\
  g0 = _mm256_slli_epi16(g3,12);				\
  g1 = _mm256_slli_epi16(g3,8);					\
  g2 = _mm256_slli_epi16(g3,4);					\
  g0 = _mm256_srai_epi16(g0,15);				\
  g1 = _mm256_srai_epi16(g1,15);				\
  g2 = _mm256_srai_epi16(g2,15);				\
  g3 = _mm256_srai_epi16(g3,15);				\
  g0 = _mm256_and_si256(g0,hqs);  /* 19 18 17 16  3  2  1  0 */	\
  g1 = _mm256_and_si256(g1,hqs);  /* 23 22 21 20  7  6  5  4 */	\
  g2 = _mm256_and_si256(g2,hqs);  /* 27 26 25 24 11 10  9  8 */	\
  g3 = _mm256_and_si256(g3,hqs);  /* 31 30 29 28 15 14 13 12 */	\
  h0 = _mm256_unpacklo_epi64(g0,g1);				\
  h2 = _mm256_unpackhi_epi64(g0,g1);				\
  h1 = _mm256_unpacklo_epi64(g2,g3);				\
  h3 = _mm256_unpackhi_epi64(g2,g3);				\
  g0 = _mm256_permute2x128_si256(h0,h1,0x20);			\
  g2 = _mm256_permute2x128_si256(h0,h1,0x31);			\
  g1 = _mm256_permute2x128_si256(h2,h3,0x20);			\
  g3 = _mm256_permute2x128_si256(h2,h3,0x31);			\
  _mm256_store_si256(&r->vec[0+2*i+0],g0);	\
  _mm256_store_si256(&r->vec[0+2*i+1],g1);	\
  _mm256_store_si256(&r->vec[8+2*i+0],g2);	\
  _mm256_store_si256(&r->vec[8+2*i+1],g3)

  f = _mm256_loadu_si256((__m256i *)msg);
  FROMMSG64(0);
  FROMMSG64(1);
  FROMMSG64(2);
  FROMMSG64(3);
}
326 
327 /*************************************************
328 * Name:        poly_tomsg
329 *
330 * Description: Convert polynomial to 32-byte message.
331 *              The coefficients of the input polynomial are assumed to
332 *              lie in the interval [0,q], i.e. the polynomial must be reduced
333 *              by poly_reduce().
334 *
335 * Arguments:   - uint8_t *msg: pointer to output message
336 *              - poly *a: pointer to input polynomial
337 **************************************************/
void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly * restrict a)
{
  unsigned int i;
  uint32_t small;
  __m256i f0, f1, g0, g1;
  const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1)/2);   /* floor(q/2) */
  const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1)/4);  /* floor(q/4) */

  /* Each iteration reduces 32 coefficients to 32 message bits. A bit is set
   * iff the coefficient rounds to q/2 rather than to 0: compute
   * |hq - f| - hhq (one's-complement absolute value via the sign mask) and
   * collect the resulting sign bits with movemask. */
  for(i=0;i<KYBER_N/32;i++) {
    f0 = _mm256_load_si256(&a->vec[2*i+0]);
    f1 = _mm256_load_si256(&a->vec[2*i+1]);
    f0 = _mm256_sub_epi16(hq, f0);
    f1 = _mm256_sub_epi16(hq, f1);
    g0 = _mm256_srai_epi16(f0, 15);           /* sign mask of each lane */
    g1 = _mm256_srai_epi16(f1, 15);
    f0 = _mm256_xor_si256(f0, g0);            /* conditional complement ~ absolute value */
    f1 = _mm256_xor_si256(f1, g1);
    f0 = _mm256_sub_epi16(f0, hhq);
    f1 = _mm256_sub_epi16(f1, hhq);
    f0 = _mm256_packs_epi16(f0, f1);
    f0 = _mm256_permute4x64_epi64(f0, 0xD8);  /* undo the lane interleaving of packs */
    small = _mm256_movemask_epi8(f0);         /* 32 sign bits -> 32 message bits */
    memcpy(&msg[4*i], &small, 4);
  }
}
363 
364 /*************************************************
365 * Name:        poly_getnoise_eta1
366 *
367 * Description: Sample a polynomial deterministically from a seed and a nonce,
368 *              with output polynomial close to centered binomial distribution
369 *              with parameter KYBER_ETA1
370 *
371 * Arguments:   - poly *r: pointer to output polynomial
372 *              - const uint8_t *seed: pointer to input seed
373 *                                     (of length KYBER_SYMBYTES bytes)
374 *              - uint8_t nonce: one-byte input nonce
375 **************************************************/
void poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
{
  ALIGNED_UINT8(KYBER_ETA1*KYBER_N/4+32) buf; // +32 bytes as required by poly_cbd_eta1
  prf(buf.coeffs, KYBER_ETA1*KYBER_N/4, seed, nonce);  /* expand seed and nonce into random bytes */
  poly_cbd_eta1(r, buf.vec);                           /* centered binomial sampling with parameter ETA1 */
}
382 
383 /*************************************************
384 * Name:        poly_getnoise_eta2
385 *
386 * Description: Sample a polynomial deterministically from a seed and a nonce,
387 *              with output polynomial close to centered binomial distribution
388 *              with parameter KYBER_ETA2
389 *
390 * Arguments:   - poly *r: pointer to output polynomial
391 *              - const uint8_t *seed: pointer to input seed
392 *                                     (of length KYBER_SYMBYTES bytes)
393 *              - uint8_t nonce: one-byte input nonce
394 **************************************************/
void poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce)
{
  /* unlike the eta1 variant, poly_cbd_eta2 needs no extra buffer slack */
  ALIGNED_UINT8(KYBER_ETA2*KYBER_N/4) buf;
  prf(buf.coeffs, KYBER_ETA2*KYBER_N/4, seed, nonce);  /* expand seed and nonce into random bytes */
  poly_cbd_eta2(r, buf.vec);                           /* centered binomial sampling with parameter ETA2 */
}
401 
402 #ifndef KYBER_90S
403 #define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE)
/* Sample four noise polynomials at once from the same seed and four distinct
 * nonces, using the 4-way vectorized SHAKE256 as PRF. Equivalent to four
 * calls to poly_getnoise_eta1 with nonces nonce0..nonce3. */
void poly_getnoise_eta1_4x(poly *r0,
                           poly *r1,
                           poly *r2,
                           poly *r3,
                           const uint8_t seed[32],
                           uint8_t nonce0,
                           uint8_t nonce1,
                           uint8_t nonce2,
                           uint8_t nonce3)
{
  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
  __m256i f;
  shake256x4incctx state;

  /* build the four 33-byte PRF inputs: seed || nonce_i */
  f = _mm256_loadu_si256((__m256i *)seed);
  _mm256_store_si256(buf[0].vec, f);
  _mm256_store_si256(buf[1].vec, f);
  _mm256_store_si256(buf[2].vec, f);
  _mm256_store_si256(buf[3].vec, f);

  buf[0].coeffs[32] = nonce0;
  buf[1].coeffs[32] = nonce1;
  buf[2].coeffs[32] = nonce2;
  buf[3].coeffs[32] = nonce3;

  /* absorb all four inputs in parallel and squeeze enough blocks for CBD sampling */
  shake256x4_inc_init(&state);
  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
  shake256x4_inc_ctx_release(&state);

  poly_cbd_eta1(r0, buf[0].vec);
  poly_cbd_eta1(r1, buf[1].vec);
  poly_cbd_eta1(r2, buf[2].vec);
  poly_cbd_eta1(r3, buf[3].vec);
}
439 
440 #if KYBER_K == 2
/* KYBER_K == 2 only: sample four noise polynomials at once where the first
 * two (r0, r1) use the eta1 CBD and the last two (r2, r3) use the eta2 CBD,
 * all from the same seed with nonces nonce0..nonce3. */
void poly_getnoise_eta1122_4x(poly *r0,
                              poly *r1,
                              poly *r2,
                              poly *r3,
                              const uint8_t seed[32],
                              uint8_t nonce0,
                              uint8_t nonce1,
                              uint8_t nonce2,
                              uint8_t nonce3)
{
  /* NOISE_NBLOCKS is sized for eta1, which is >= eta2, so one buffer size fits all four */
  ALIGNED_UINT8(NOISE_NBLOCKS*SHAKE256_RATE) buf[4];
  __m256i f;
  shake256x4incctx state;

  /* build the four 33-byte PRF inputs: seed || nonce_i */
  f = _mm256_loadu_si256((__m256i *)seed);
  _mm256_store_si256(buf[0].vec, f);
  _mm256_store_si256(buf[1].vec, f);
  _mm256_store_si256(buf[2].vec, f);
  _mm256_store_si256(buf[3].vec, f);

  buf[0].coeffs[32] = nonce0;
  buf[1].coeffs[32] = nonce1;
  buf[2].coeffs[32] = nonce2;
  buf[3].coeffs[32] = nonce3;

  /* absorb all four inputs in parallel and squeeze enough blocks for CBD sampling */
  shake256x4_inc_init(&state);
  shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
  shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);
  shake256x4_inc_ctx_release(&state);

  poly_cbd_eta1(r0, buf[0].vec);
  poly_cbd_eta1(r1, buf[1].vec);
  poly_cbd_eta2(r2, buf[2].vec);
  poly_cbd_eta2(r3, buf[3].vec);
}
476 #endif
477 #endif
478 
479 /*************************************************
480 * Name:        poly_ntt
481 *
482 * Description: Computes negacyclic number-theoretic transform (NTT) of
483 *              a polynomial in place.
484 *              Input coefficients assumed to be in normal order,
485 *              output coefficients are in special order that is natural
486 *              for the vectorization. Input coefficients are assumed to be
487 *              bounded by q in absolute value, output coefficients are bounded
488 *              by 16118 in absolute value.
489 *
490 * Arguments:   - poly *r: pointer to in/output polynomial
491 **************************************************/
void poly_ntt(poly *r)
{
  /* delegate to the vectorized forward NTT kernel */
  ntt_avx(r->vec, qdata.vec);
}
496 
497 /*************************************************
498 * Name:        poly_invntt_tomont
499 *
500 * Description: Computes inverse of negacyclic number-theoretic transform (NTT)
501 *              of a polynomial in place;
502 *              Input coefficients assumed to be in special order from vectorized
503 *              forward ntt, output in normal order. Input coefficients can be
504 *              arbitrary 16-bit integers, output coefficients are bounded by 14870
505 *              in absolute value.
506 *
507 * Arguments:   - poly *a: pointer to in/output polynomial
508 **************************************************/
void poly_invntt_tomont(poly *r)
{
  /* delegate to the vectorized inverse NTT kernel (output in Montgomery domain) */
  invntt_avx(r->vec, qdata.vec);
}
513 
/*************************************************
* Name:        poly_nttunpack
*
* Description: In-place permutation of the coefficients of a polynomial;
*              delegates to nttunpack_avx, which converts between coefficient
*              orders used by the vectorized representation.
*
* Arguments:   - poly *r: pointer to input/output polynomial
**************************************************/
void poly_nttunpack(poly *r)
{
  nttunpack_avx(r->vec, qdata.vec);
}
518 
519 /*************************************************
520 * Name:        poly_basemul_montgomery
521 *
522 * Description: Multiplication of two polynomials in NTT domain.
523 *              One of the input polynomials needs to have coefficients
524 *              bounded by q, the other polynomial can have arbitrary
525 *              coefficients. Output coefficients are bounded by 6656.
526 *
527 * Arguments:   - poly *r: pointer to output polynomial
528 *              - const poly *a: pointer to first input polynomial
529 *              - const poly *b: pointer to second input polynomial
530 **************************************************/
void poly_basemul_montgomery(poly *r, const poly *a, const poly *b)
{
  /* delegate to the vectorized basemul kernel (Montgomery arithmetic) */
  basemul_avx(r->vec, a->vec, b->vec, qdata.vec);
}
535 
536 /*************************************************
537 * Name:        poly_tomont
538 *
539 * Description: Inplace conversion of all coefficients of a polynomial
540 *              from normal domain to Montgomery domain
541 *
542 * Arguments:   - poly *r: pointer to input/output polynomial
543 **************************************************/
void poly_tomont(poly *r)
{
  /* delegate to the vectorized Montgomery-conversion kernel */
  tomont_avx(r->vec, qdata.vec);
}
548 
549 /*************************************************
550 * Name:        poly_reduce
551 *
552 * Description: Applies Barrett reduction to all coefficients of a polynomial
553 *              for details of the Barrett reduction see comments in reduce.c
554 *
555 * Arguments:   - poly *r: pointer to input/output polynomial
556 **************************************************/
void poly_reduce(poly *r)
{
  /* delegate to the vectorized Barrett-reduction kernel */
  reduce_avx(r->vec, qdata.vec);
}
561 
562 /*************************************************
563 * Name:        poly_add
564 *
565 * Description: Add two polynomials. No modular reduction
566 *              is performed.
567 *
568 * Arguments: - poly *r: pointer to output polynomial
569 *            - const poly *a: pointer to first input polynomial
570 *            - const poly *b: pointer to second input polynomial
571 **************************************************/
void poly_add(poly *r, const poly *a, const poly *b)
{
  /* Lane-wise 16-bit addition over all coefficients, 16 per vector;
   * no modular reduction is performed. */
  unsigned int j;

  for(j=0;j<KYBER_N/16;j++) {
    const __m256i x = _mm256_load_si256(&a->vec[j]);
    const __m256i y = _mm256_load_si256(&b->vec[j]);
    _mm256_store_si256(&r->vec[j], _mm256_add_epi16(x, y));
  }
}
584 
585 /*************************************************
586 * Name:        poly_sub
587 *
588 * Description: Subtract two polynomials. No modular reduction
589 *              is performed.
590 *
591 * Arguments: - poly *r: pointer to output polynomial
592 *            - const poly *a: pointer to first input polynomial
593 *            - const poly *b: pointer to second input polynomial
594 **************************************************/
void poly_sub(poly *r, const poly *a, const poly *b)
{
  /* Lane-wise 16-bit subtraction (a - b) over all coefficients, 16 per
   * vector; no modular reduction is performed. */
  unsigned int j;

  for(j=0;j<KYBER_N/16;j++) {
    const __m256i x = _mm256_load_si256(&a->vec[j]);
    const __m256i y = _mm256_load_si256(&b->vec[j]);
    _mm256_store_si256(&r->vec[j], _mm256_sub_epi16(x, y));
  }
}
607