
/*
 * AES256-GCM, based on the "Intel Carry-Less Multiplication Instruction and its Usage for Computing
 * the GCM Mode" paper and reference code, using the aggregated reduction method.
 * Originally adapted by Romain Dolbeau.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "core.h"
#include "crypto_aead_aes256gcm.h"
#include "export.h"
#include "private/common.h"
#include "private/sse2_64_32.h"
#include "randombytes.h"
#include "runtime.h"
#include "utils.h"

#if defined(HAVE_TMMINTRIN_H) && defined(HAVE_WMMINTRIN_H)

# ifdef __GNUC__
#  pragma GCC target("ssse3")
#  pragma GCC target("aes")
#  pragma GCC target("pclmul")
# endif

#include <tmmintrin.h>
#include <wmmintrin.h>

#ifndef ENOSYS
# define ENOSYS ENXIO
#endif

#if defined(__INTEL_COMPILER) || defined(_bswap64)
#elif defined(_MSC_VER)
# define _bswap64(a) _byteswap_uint64(a)
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
# define _bswap64(a) __builtin_bswap64(a)
#else
static inline uint64_t
_bswap64(const uint64_t x)
{
    return
        ((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) |
        ((x << 24) & 0x0000FF0000000000UL) | ((x <<  8) & 0x000000FF00000000UL) |
        ((x >>  8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) |
        ((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL);
}
#endif
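/* Illustrative sanity check for the fallback (assumed values, not part of the build):
 * _bswap64(0x0102030405060708ULL) yields 0x0807060504030201ULL, matching
 * __builtin_bswap64() and _byteswap_uint64(). */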

typedef struct context {
    CRYPTO_ALIGN(16) unsigned char H[16];
    __m128i          rkeys[16];
} context;

static inline void
aesni_key256_expand(const unsigned char *key, __m128i * const rkeys)
{
    __m128i  X0, X1, X2, X3;
    int      i = 0;

    X0 = _mm_loadu_si128((const __m128i *) &key[0]);
    rkeys[i++] = X0;

    X2 = _mm_loadu_si128((const __m128i *) &key[16]);
    rkeys[i++] = X2;

#define EXPAND_KEY_1(S) do { \
    X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X2, (S)), 0xff); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x10)); \
    X0 = _mm_xor_si128(X0, X3); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x8c)); \
    X0 = _mm_xor_si128(_mm_xor_si128(X0, X3), X1); \
    rkeys[i++] = X0; \
} while (0)

#define EXPAND_KEY_2(S) do { \
    X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X0, (S)), 0xaa); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x10)); \
    X2 = _mm_xor_si128(X2, X3); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x8c)); \
    X2 = _mm_xor_si128(_mm_xor_si128(X2, X3), X1); \
    rkeys[i++] = X2; \
} while (0)

    X3 = _mm_setzero_si128();
    EXPAND_KEY_1(0x01); EXPAND_KEY_2(0x01);
    EXPAND_KEY_1(0x02); EXPAND_KEY_2(0x02);
    EXPAND_KEY_1(0x04); EXPAND_KEY_2(0x04);
    EXPAND_KEY_1(0x08); EXPAND_KEY_2(0x08);
    EXPAND_KEY_1(0x10); EXPAND_KEY_2(0x10);
    EXPAND_KEY_1(0x20); EXPAND_KEY_2(0x20);
    EXPAND_KEY_1(0x40);
}
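/* Note: AES-256 uses 14 rounds, hence the 15 round keys rkeys[0..14] filled above.
 * EXPAND_KEY_1 derives the even-indexed round keys (the RotWord+SubWord+rcon step of
 * the standard schedule, via the 0xff shuffle of the keygenassist result) and
 * EXPAND_KEY_2 the odd-indexed ones (SubWord only, no rcon, via the 0xaa shuffle). */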

/** single, by-the-book AES encryption with AES-NI */
static inline void
aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
{
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);

    temp = _mm_aesenc_si128(temp, rkeys[1]);
    temp = _mm_aesenc_si128(temp, rkeys[2]);
    temp = _mm_aesenc_si128(temp, rkeys[3]);
    temp = _mm_aesenc_si128(temp, rkeys[4]);
    temp = _mm_aesenc_si128(temp, rkeys[5]);
    temp = _mm_aesenc_si128(temp, rkeys[6]);
    temp = _mm_aesenc_si128(temp, rkeys[7]);
    temp = _mm_aesenc_si128(temp, rkeys[8]);
    temp = _mm_aesenc_si128(temp, rkeys[9]);
    temp = _mm_aesenc_si128(temp, rkeys[10]);
    temp = _mm_aesenc_si128(temp, rkeys[11]);
    temp = _mm_aesenc_si128(temp, rkeys[12]);
    temp = _mm_aesenc_si128(temp, rkeys[13]);

    temp = _mm_aesenclast_si128(temp, rkeys[14]);
    _mm_storeu_si128((__m128i *) out, temp);
}

/** multiple-blocks-at-once AES encryption with AES-NI;
    on Haswell, aesenc has a latency of 7 and a throughput of 1,
    so the sequence of aesenc instructions should be bubble-free if you
    have at least 8 blocks. Let's build an arbitrary-sized
    function */
/* Step 1: loading the nonce */
/* load & increment the n vector (non-vectorized, unused for now) */
#define NVDECLx(a)                                                             \
    __m128i nv##a

#define NVx(a)                                                                 \
    nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt);         \
    n[3]++
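/* Note: the pt shuffle applied in NVx only byte-reverses the last 32-bit word of the
 * block, so the plain little-endian n[3]++ above becomes the big-endian 32-bit
 * counter increment that GCM's CTR mode expects; the 96-bit IV in n[0..2] is left
 * untouched. */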

/* Step 2: define value in round one (xor with subkey #0, aka the key) */
#define TEMPDECLx(a) \
    __m128i temp##a

#define TEMPx(a) \
    temp##a = _mm_xor_si128(nv##a, rkeys[0])

/* Step 3: one round of AES */
#define AESENCx(a) \
    temp##a = _mm_aesenc_si128(temp##a, rkeys[roundctr])

/* Step 4: last round of AES */
#define AESENCLASTx(a) \
    temp##a = _mm_aesenclast_si128(temp##a, rkeys[14])

/* Step 5: store result */
#define STOREx(a) \
    _mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a)

/* all the MAKE* macros are for automatic explicit unrolling */
#define MAKE4(X) \
    X(0);        \
    X(1);        \
    X(2);        \
    X(3)

#define MAKE8(X) \
    X(0);        \
    X(1);        \
    X(2);        \
    X(3);        \
    X(4);        \
    X(5);        \
    X(6);        \
    X(7)

#define COUNTER_INC2(N) (N)[3] += 2

/* create a function with unrolling factor N; MAKEN is the matching unrolling
   macro defined above (the N in MAKEN must match N, obviously). */
#define FUNC(N, MAKEN)                                                                                \
    static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys)        \
    {                                                                                                 \
        const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);        \
        int           roundctr;                                                                       \
        MAKEN(NVDECLx);                                                                               \
        MAKEN(TEMPDECLx);                                                                             \
                                                                                                      \
        MAKEN(NVx);                                                                                   \
        MAKEN(TEMPx);                                                                                 \
        for (roundctr = 1; roundctr < 14; roundctr++) {                                               \
            MAKEN(AESENCx);                                                                           \
        }                                                                                             \
        MAKEN(AESENCLASTx);                                                                           \
        MAKEN(STOREx);                                                                                \
    }

FUNC(8, MAKE8)
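/* FUNC(8, MAKE8) expands to aesni_encrypt8(out, n, rkeys), which writes 8
 * consecutive 16-byte counter-mode key-stream blocks to out[0..127] and leaves the
 * counter word n[3] advanced by 8, keeping the AESENC pipeline full as discussed
 * above. */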

/* all GF(2^128) functions are by the book, meaning this one:
   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
*/

static inline void
addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
{
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i       A, B, C;
    __m128i       tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
    __m128i       tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp18;
    __m128i       tmp19, tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
    __m128i       tmp28, tmp29, tmp30, tmp31, tmp32, tmp33, tmp34, tmp35, tmp36;

    if (xlen >= 16) {
        A = _mm_loadu_si128((const __m128i *) a);
    } else {
        CRYPTO_ALIGN(16) unsigned char padded[16];
        unsigned int i;

        memset(padded, 0, 16);
        for (i = 0; i < xlen; i++) {
            padded[i] = a[i];
        }
        A = _mm_load_si128((const __m128i *) padded);
    }
    A = _mm_shuffle_epi8(A, rev);
    B = _mm_loadu_si128((const __m128i *) b);
    C = _mm_loadu_si128((const __m128i *) c);
    A = _mm_xor_si128(A, C);
    tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
    tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
    tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
    tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
    tmp10 = _mm_xor_si128(tmp4, tmp5);
    tmp13 = _mm_slli_si128(tmp10, 8);
    tmp11 = _mm_srli_si128(tmp10, 8);
    tmp15 = _mm_xor_si128(tmp3, tmp13);
    tmp17 = _mm_xor_si128(tmp6, tmp11);
    tmp7 = _mm_srli_epi32(tmp15, 31);
    tmp8 = _mm_srli_epi32(tmp17, 31);
    tmp16 = _mm_slli_epi32(tmp15, 1);
    tmp18 = _mm_slli_epi32(tmp17, 1);
    tmp9 = _mm_srli_si128(tmp7, 12);
    tmp22 = _mm_slli_si128(tmp8, 4);
    tmp25 = _mm_slli_si128(tmp7, 4);
    tmp29 = _mm_or_si128(tmp16, tmp25);
    tmp19 = _mm_or_si128(tmp18, tmp22);
    tmp20 = _mm_or_si128(tmp19, tmp9);
    tmp26 = _mm_slli_epi32(tmp29, 31);
    tmp23 = _mm_slli_epi32(tmp29, 30);
    tmp32 = _mm_slli_epi32(tmp29, 25);
    tmp27 = _mm_xor_si128(tmp26, tmp23);
    tmp28 = _mm_xor_si128(tmp27, tmp32);
    tmp24 = _mm_srli_si128(tmp28, 4);
    tmp33 = _mm_slli_si128(tmp28, 12);
    tmp30 = _mm_xor_si128(tmp29, tmp33);
    tmp2 = _mm_srli_epi32(tmp30, 1);
    tmp12 = _mm_srli_epi32(tmp30, 2);
    tmp14 = _mm_srli_epi32(tmp30, 7);
    tmp34 = _mm_xor_si128(tmp2, tmp12);
    tmp35 = _mm_xor_si128(tmp34, tmp14);
    tmp36 = _mm_xor_si128(tmp35, tmp24);
    tmp31 = _mm_xor_si128(tmp30, tmp36);
    tmp21 = _mm_xor_si128(tmp20, tmp31);
    _mm_storeu_si128((__m128i *) c, tmp21);
}
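/* In other words, addmul() is one GHASH update: the (zero-padded) block a is
 * byte-reversed, xored into the running accumulator c, and multiplied by the
 * byte-reversed key b (= H) in GF(2^128). The four CLMULs build the 256-bit
 * carry-less product, the shift-left-by-one compensates for the bit-reflected
 * representation, and the trailing shifts/xors reduce modulo the GCM polynomial
 * x^128 + x^7 + x^2 + x + 1. */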

/* pure multiplication, for pre-computing powers of H */
static inline __m128i
mulv(__m128i A, __m128i B)
{
    __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
    __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
    __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
    __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
    __m128i tmp10 = _mm_xor_si128(tmp4, tmp5);
    __m128i tmp13 = _mm_slli_si128(tmp10, 8);
    __m128i tmp11 = _mm_srli_si128(tmp10, 8);
    __m128i tmp15 = _mm_xor_si128(tmp3, tmp13);
    __m128i tmp17 = _mm_xor_si128(tmp6, tmp11);
    __m128i tmp7 = _mm_srli_epi32(tmp15, 31);
    __m128i tmp8 = _mm_srli_epi32(tmp17, 31);
    __m128i tmp16 = _mm_slli_epi32(tmp15, 1);
    __m128i tmp18 = _mm_slli_epi32(tmp17, 1);
    __m128i tmp9 = _mm_srli_si128(tmp7, 12);
    __m128i tmp22 = _mm_slli_si128(tmp8, 4);
    __m128i tmp25 = _mm_slli_si128(tmp7, 4);
    __m128i tmp29 = _mm_or_si128(tmp16, tmp25);
    __m128i tmp19 = _mm_or_si128(tmp18, tmp22);
    __m128i tmp20 = _mm_or_si128(tmp19, tmp9);
    __m128i tmp26 = _mm_slli_epi32(tmp29, 31);
    __m128i tmp23 = _mm_slli_epi32(tmp29, 30);
    __m128i tmp32 = _mm_slli_epi32(tmp29, 25);
    __m128i tmp27 = _mm_xor_si128(tmp26, tmp23);
    __m128i tmp28 = _mm_xor_si128(tmp27, tmp32);
    __m128i tmp24 = _mm_srli_si128(tmp28, 4);
    __m128i tmp33 = _mm_slli_si128(tmp28, 12);
    __m128i tmp30 = _mm_xor_si128(tmp29, tmp33);
    __m128i tmp2 = _mm_srli_epi32(tmp30, 1);
    __m128i tmp12 = _mm_srli_epi32(tmp30, 2);
    __m128i tmp14 = _mm_srli_epi32(tmp30, 7);
    __m128i tmp34 = _mm_xor_si128(tmp2, tmp12);
    __m128i tmp35 = _mm_xor_si128(tmp34, tmp14);
    __m128i tmp36 = _mm_xor_si128(tmp35, tmp24);
    __m128i tmp31 = _mm_xor_si128(tmp30, tmp36);
    __m128i C = _mm_xor_si128(tmp20, tmp31);

    return C;
}
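/* mulv() is only used at per-message setup time, to precompute the powers of the
 * GHASH key needed by the aggregated reduction below, as done in the encrypt and
 * decrypt paths:
 *
 *     H2v = mulv(Hv, Hv);
 *     H3v = mulv(H2v, Hv);
 *     H4v = mulv(H3v, Hv);
 */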

/* 4 multiply-accumulate operations at once; again, see
   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
   for the Aggregated Reduction Method & sample code.
   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */

#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
#define RED_MUL_MID(a)                          \
    tmp##a = _mm_shuffle_epi32(H##a, 0x4e);     \
    tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e);  \
    tmp##a = _mm_xor_si128(tmp##a, H##a);       \
    tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
    tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)

#define MULREDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, accv) \
do { \
    MAKE4(RED_DECL); \
    __m128i lo, hi; \
    __m128i tmp8, tmp9; \
    __m128i H0 = H0_; \
    __m128i H1 = H1_; \
    __m128i H2 = H2_; \
    __m128i H3 = H3_; \
    __m128i X0 = X0_; \
    __m128i X1 = X1_; \
    __m128i X2 = X2_; \
    __m128i X3 = X3_; \
\
/* byte-reverse the inputs & xor the first one into the accumulator */ \
\
    MAKE4(RED_SHUFFLE); \
    X3 = _mm_xor_si128(X3, accv); \
\
/* 4 low H*X (x0*h0) */ \
\
    MAKE4(RED_MUL_LOW); \
    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
    lo = _mm_xor_si128(lo, H2_X2_lo); \
    lo = _mm_xor_si128(lo, H3_X3_lo); \
\
/* 4 high H*X (x1*h1) */ \
\
    MAKE4(RED_MUL_HIGH); \
    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
    hi = _mm_xor_si128(hi, H2_X2_hi); \
    hi = _mm_xor_si128(hi, H3_X3_hi); \
\
/* 4 middle H*X, using Karatsuba, i.e. \
     x1*h0+x0*h1 = (x1+x0)*(h1+h0)-x1*h1-x0*h0 \
     we already have all x1*h1 & x0*h0 (accumulated in hi & lo) \
     (0 is the low half and 1 is the high half) \
  */ \
/* permute the high and low 64 bits in H1 & X1, \
     so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
     then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
     and finally multiply \
  */ \
    MAKE4(RED_MUL_MID); \
\
/* subtracts x1*h1 and x0*h0 */ \
    tmp0 = _mm_xor_si128(tmp0, lo); \
    tmp0 = _mm_xor_si128(tmp0, hi); \
    tmp0 = _mm_xor_si128(tmp1, tmp0); \
    tmp0 = _mm_xor_si128(tmp2, tmp0); \
    tmp0 = _mm_xor_si128(tmp3, tmp0); \
\
    /* reduction */ \
    tmp0B = _mm_slli_si128(tmp0, 8); \
    tmp0 = _mm_srli_si128(tmp0, 8); \
    lo = _mm_xor_si128(tmp0B, lo); \
    hi = _mm_xor_si128(tmp0, hi); \
    tmp3 = lo; \
    tmp2B = hi; \
    tmp3B = _mm_srli_epi32(tmp3, 31); \
    tmp8 = _mm_srli_epi32(tmp2B, 31); \
    tmp3 = _mm_slli_epi32(tmp3, 1); \
    tmp2B = _mm_slli_epi32(tmp2B, 1); \
    tmp9 = _mm_srli_si128(tmp3B, 12); \
    tmp8 = _mm_slli_si128(tmp8, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 4); \
    tmp3 = _mm_or_si128(tmp3, tmp3B); \
    tmp2B = _mm_or_si128(tmp2B, tmp8); \
    tmp2B = _mm_or_si128(tmp2B, tmp9); \
    tmp3B = _mm_slli_epi32(tmp3, 31); \
    tmp8 = _mm_slli_epi32(tmp3, 30); \
    tmp9 = _mm_slli_epi32(tmp3, 25); \
    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
    tmp8 = _mm_srli_si128(tmp3B, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 12); \
    tmp3 = _mm_xor_si128(tmp3, tmp3B); \
    tmp2 = _mm_srli_epi32(tmp3, 1); \
    tmp0B = _mm_srli_epi32(tmp3, 2); \
    tmp1B = _mm_srli_epi32(tmp3, 7); \
    tmp2 = _mm_xor_si128(tmp2, tmp0B); \
    tmp2 = _mm_xor_si128(tmp2, tmp1B); \
    tmp2 = _mm_xor_si128(tmp2, tmp8); \
    tmp3 = _mm_xor_si128(tmp3, tmp2); \
    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
\
    accv = tmp2B; \
} while(0)
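/* Net effect of MULREDUCE4, with the X blocks byte-reversed on entry and accv (and
 * the H arguments) already in reversed form:
 *
 *     accv <- H3_*(X3_ ^ accv) ^ H2_*X2_ ^ H1_*X1_ ^ H0_*X0_   in GF(2^128)
 *
 * with a single modular reduction at the end. Callers pass H, H^2, H^3, H^4 as
 * H0_..H3_ and the four data blocks newest-to-oldest as X0_..X3_, which is exactly
 * four chained GHASH updates accv <- (accv ^ block)*H. */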

#define XORx(a)                                                       \
        temp##a = _mm_xor_si128(temp##a,                              \
                                _mm_loadu_si128((const __m128i *) (in + a * 16)))

#define LOADx(a)                                                      \
    __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16))

/* full encrypt & checksum 8 blocks at once */
#define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
do { \
    unsigned char *out = out_; \
    uint32_t *n = n_; \
    const unsigned char *in = in_; \
    const __m128i hv = hv_; \
    const __m128i h2v = h2v_; \
    const __m128i h3v = h3v_; \
    const __m128i h4v = h4v_; \
    const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
    __m128i       accv_; \
    int           roundctr; \
    \
    MAKE8(NVDECLx); \
    MAKE8(TEMPDECLx); \
    MAKE8(NVx); \
    MAKE8(TEMPx); \
    for (roundctr = 1; roundctr < 14; roundctr++) { \
        MAKE8(AESENCx); \
    } \
    MAKE8(AESENCLASTx); \
    MAKE8(XORx); \
    MAKE8(STOREx); \
    accv_ = _mm_load_si128((const __m128i *) accum); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv_); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv_); \
    _mm_store_si128((__m128i *) accum, accv_); \
} while(0)

/* checksum 8 blocks at once */
#define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
do { \
    const unsigned char *in = in_; \
    const __m128i hv = hv_; \
    const __m128i h2v = h2v_; \
    const __m128i h3v = h3v_; \
    const __m128i h4v = h4v_; \
    __m128i accv_; \
    \
    MAKE8(LOADx); \
    accv_ = _mm_load_si128((const __m128i *) accum); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv_); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv_); \
    _mm_store_si128((__m128i *) accum, accv_); \
} while(0)

/* decrypt 8 blocks at once */
#define aesni_decrypt8full(out_, n_, rkeys, in_) \
do { \
    unsigned char       *out = out_; \
    uint32_t            *n = n_; \
    const unsigned char *in = in_; \
    const __m128i        pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
    int                  roundctr; \
\
    MAKE8(NVDECLx); \
    MAKE8(TEMPDECLx); \
    MAKE8(NVx); \
    MAKE8(TEMPx); \
    for (roundctr = 1; roundctr < 14; roundctr++) { \
        MAKE8(AESENCx); \
    } \
    MAKE8(AESENCLASTx); \
    MAKE8(XORx); \
    MAKE8(STOREx); \
} while(0)
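/* Since GCM runs AES in counter mode, "decryption" here is the same key-stream
 * generation as in aesni_encrypt8full(), xored into the ciphertext, just without
 * the GHASH update: the tag is checked over the ciphertext separately, before any
 * plaintext is produced. */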

int
crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_,
                               const unsigned char *k)
{
    context       *ctx = (context *) ctx_;
    __m128i       *rkeys = ctx->rkeys;
    __m128i        zero = _mm_setzero_si128();
    unsigned char *H = ctx->H;

    COMPILER_ASSERT((sizeof *ctx_) >= (sizeof *ctx));
    aesni_key256_expand(k, rkeys);
    aesni_encrypt1(H, zero, rkeys);

    return 0;
}
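/* ctx_ now holds the expanded round keys plus H = AES-256_k(0^128), the GHASH
 * subkey; everything else is derived per message. A minimal usage sketch of the
 * precomputation interface (key, nonce, msg and MSG_LEN are hypothetical
 * caller-provided buffers and sizes):
 *
 *     CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state st;
 *     unsigned char      ct[MSG_LEN + crypto_aead_aes256gcm_ABYTES];
 *     unsigned long long ct_len;
 *
 *     crypto_aead_aes256gcm_beforenm(&st, key);
 *     crypto_aead_aes256gcm_encrypt_afternm(ct, &ct_len, msg, MSG_LEN,
 *                                           NULL, 0, NULL, nonce, &st);
 */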

int
crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c,
                                               unsigned char *mac, unsigned long long *maclen_p,
                                               const unsigned char *m, unsigned long long mlen,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *nsec,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    const __m128i       rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    const context      *ctx = (const context *) ctx_;
    const __m128i      *rkeys = ctx->rkeys;
    __m128i             Hv, H2v, H3v, H4v, accv;
    unsigned long long  i, j;
    unsigned long long  adlen_rnd64 = adlen & ~63ULL;
    unsigned long long  mlen_rnd128 = mlen & ~127ULL;
    CRYPTO_ALIGN(16) uint32_t      n2[4];
    CRYPTO_ALIGN(16) unsigned char H[16];
    CRYPTO_ALIGN(16) unsigned char T[16];
    CRYPTO_ALIGN(16) unsigned char accum[16];
    CRYPTO_ALIGN(16) unsigned char fb[16];

    (void) nsec;
    memcpy(H, ctx->H, sizeof H);
    if (mlen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) {
        sodium_misuse(); /* LCOV_EXCL_LINE */
    }
    memcpy(&n2[0], npub, 3 * 4);
    n2[3] = 0x01000000;
    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
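    /* Per the GCM spec with a 96-bit IV: n2 is IV || counter, with the counter
     * starting at 1 (0x01000000 is the value 1 stored big-endian in a little-endian
     * word). T = AES_k(J0) is the block that will mask the GHASH output to form the
     * tag at the end. */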
    {
        uint64_t x;
        x = _bswap64((uint64_t) (8 * adlen));
        memcpy(&fb[0], &x, sizeof x);
        x = _bswap64((uint64_t) (8 * mlen));
        memcpy(&fb[8], &x, sizeof x);
    }
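    /* fb is the final GHASH block: the bit lengths of the associated data and of the
     * message, each as a 64-bit big-endian integer, as required by the GCM spec. */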
    /* we store H (and its powers) byte-reversed once and for all */
    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
    _mm_store_si128((__m128i *) H, Hv);
    H2v = mulv(Hv, Hv);
    H3v = mulv(H2v, Hv);
    H4v = mulv(H3v, Hv);

    accv = _mm_setzero_si128();
    /* GCM unrolled by 4 (unrolling by 8 doesn't improve things when using MULREDUCE4) */
    for (i = 0; i < adlen_rnd64; i += 64) {
        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
    }
    _mm_store_si128((__m128i *) accum, accv);

    /* GCM remainder loop */
    for (i = adlen_rnd64; i < adlen; i += 16) {
        unsigned int blocklen = 16;

        if (i + (unsigned long long) blocklen > adlen) {
            blocklen = (unsigned int) (adlen - i);
        }
        addmul(accum, ad + i, blocklen, H);
    }

/* this only does 8 full blocks, so no fancy bounds checking is necessary */
#define LOOPRND128                                                                                   \
    do {                                                                                             \
        const int iter = 8;                                                                          \
        const int lb = iter * 16;                                                                    \
                                                                                                     \
        for (i = 0; i < mlen_rnd128; i += lb) {                                                      \
            aesni_encrypt8full(c + i, n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v, rev);              \
        }                                                                                            \
    } while(0)

/* remainder loop, with the slower GCM update to accommodate partial blocks */
#define LOOPRMD128                                           \
    do {                                                     \
        const int iter = 8;                                  \
        const int lb = iter * 16;                            \
                                                             \
        for (i = mlen_rnd128; i < mlen; i += lb) {           \
            CRYPTO_ALIGN(16) unsigned char outni[8 * 16];    \
            unsigned long long mj = lb;                      \
                                                             \
            aesni_encrypt8(outni, n2, rkeys);                \
            if ((i + mj) >= mlen) {                          \
                mj = mlen - i;                               \
            }                                                \
            for (j = 0; j < mj; j++) {                       \
                c[i + j] = m[i + j] ^ outni[j];              \
            }                                                \
            for (j = 0; j < mj; j += 16) {                   \
                unsigned int bl = 16;                        \
                                                             \
                if (j + (unsigned long long) bl >= mj) {     \
                    bl = (unsigned int) (mj - j);            \
                }                                            \
                addmul(accum, c + i + j, bl, H);             \
            }                                                \
        }                                                    \
    } while(0)

    n2[3] &= 0x00ffffff;
    COUNTER_INC2(n2);
    LOOPRND128;
    LOOPRMD128;

    addmul(accum, fb, 16, H);
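    /* The addmul() above folds the length block into GHASH; the loop below then
     * forms the tag as T ^ GHASH. accum is kept byte-reversed by addmul(), hence
     * the accum[15 - i] indexing. */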

    for (i = 0; i < 16; ++i) {
        mac[i] = T[i] ^ accum[15 - i];
    }
    if (maclen_p != NULL) {
        *maclen_p = 16;
    }
    return 0;
}

int
crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen_p,
                                      const unsigned char *m, unsigned long long mlen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *nsec,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    int ret = crypto_aead_aes256gcm_encrypt_detached_afternm(c,
                                                             c + mlen, NULL,
                                                             m, mlen,
                                                             ad, adlen,
                                                             nsec, npub, ctx_);
    if (clen_p != NULL) {
        *clen_p = mlen + crypto_aead_aes256gcm_ABYTES;
    }
    return ret;
}

int
crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec,
                                               const unsigned char *c, unsigned long long clen,
                                               const unsigned char *mac,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    const __m128i       rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    const context      *ctx = (const context *) ctx_;
    const __m128i      *rkeys = ctx->rkeys;
    __m128i             Hv, H2v, H3v, H4v, accv;
    unsigned long long  i, j;
    unsigned long long  adlen_rnd64 = adlen & ~63ULL;
    unsigned long long  mlen;
    unsigned long long  mlen_rnd128;
    CRYPTO_ALIGN(16) uint32_t      n2[4];
    CRYPTO_ALIGN(16) unsigned char H[16];
    CRYPTO_ALIGN(16) unsigned char T[16];
    CRYPTO_ALIGN(16) unsigned char accum[16];
    CRYPTO_ALIGN(16) unsigned char fb[16];

    (void) nsec;
    if (clen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) {
        sodium_misuse(); /* LCOV_EXCL_LINE */
    }
    mlen = clen;

    memcpy(&n2[0], npub, 3 * 4);
    n2[3] = 0x01000000;
    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);

    {
        uint64_t x;
        x = _bswap64((uint64_t)(8 * adlen));
        memcpy(&fb[0], &x, sizeof x);
        x = _bswap64((uint64_t)(8 * mlen));
        memcpy(&fb[8], &x, sizeof x);
    }

    memcpy(H, ctx->H, sizeof H);
    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
    _mm_store_si128((__m128i *) H, Hv);
    H2v = mulv(Hv, Hv);
    H3v = mulv(H2v, Hv);
    H4v = mulv(H3v, Hv);

    accv = _mm_setzero_si128();
    for (i = 0; i < adlen_rnd64; i += 64) {
        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
    }
    _mm_store_si128((__m128i *) accum, accv);

    for (i = adlen_rnd64; i < adlen; i += 16) {
        unsigned int blocklen = 16;
        if (i + (unsigned long long) blocklen > adlen) {
            blocklen = (unsigned int) (adlen - i);
        }
        addmul(accum, ad + i, blocklen, H);
    }

    mlen_rnd128 = mlen & ~127ULL;

#define LOOPACCUMDRND128                                                                          \
    do {                                                                                          \
        const int iter = 8;                                                                       \
        const int lb = iter * 16;                                                                 \
        for (i = 0; i < mlen_rnd128; i += lb) {                                                   \
            aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v, rev);                              \
        }                                                                                         \
    } while(0)

#define LOOPDRND128                                                                               \
    do {                                                                                          \
        const int iter = 8;                                                                       \
        const int lb = iter * 16;                                                                 \
                                                                                                  \
        for (i = 0; i < mlen_rnd128; i += lb) {                                                   \
            aesni_decrypt8full(m + i, n2, rkeys, c + i);                                          \
        }                                                                                         \
    } while(0)

#define LOOPACCUMDRMD128                                     \
    do {                                                     \
        const int iter = 8;                                  \
        const int lb = iter * 16;                            \
                                                             \
        for (i = mlen_rnd128; i < mlen; i += lb) {           \
            unsigned long long mj = lb;                      \
                                                             \
            if ((i + mj) >= mlen) {                          \
                mj = mlen - i;                               \
            }                                                \
            for (j = 0; j < mj; j += 16) {                   \
                unsigned int bl = 16;                        \
                                                             \
                if (j + (unsigned long long) bl >= mj) {     \
                    bl = (unsigned int) (mj - j);            \
                }                                            \
                addmul(accum, c + i + j, bl, H);             \
            }                                                \
        }                                                    \
    } while(0)

#define LOOPDRMD128                                          \
    do {                                                     \
        const int iter = 8;                                  \
        const int lb = iter * 16;                            \
                                                             \
        for (i = mlen_rnd128; i < mlen; i += lb) {           \
            CRYPTO_ALIGN(16) unsigned char outni[8 * 16];    \
            unsigned long long mj = lb;                      \
                                                             \
            if ((i + mj) >= mlen) {                          \
                mj = mlen - i;                               \
            }                                                \
            aesni_encrypt8(outni, n2, rkeys);                \
            for (j = 0; j < mj; j++) {                       \
                m[i + j] = c[i + j] ^ outni[j];              \
            }                                                \
        }                                                    \
    } while(0)

    n2[3] &= 0x00ffffff;

    COUNTER_INC2(n2);
    LOOPACCUMDRND128;
    LOOPACCUMDRMD128;
    addmul(accum, fb, 16, H);
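    /* Constant-time tag check: all byte differences between the received mac and the
     * recomputed tag are OR-ed together; on mismatch the output buffer is zeroed and
     * -1 is returned. Actual decryption only happens below, after the tag has been
     * verified. */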
    {
        unsigned char d = 0;

        for (i = 0; i < 16; i++) {
            d |= (mac[i] ^ (T[i] ^ accum[15 - i]));
        }
        if (d != 0) {
            if (m != NULL) {
                memset(m, 0, mlen);
            }
            return -1;
        }
        if (m == NULL) {
            return 0;
        }
    }
    n2[3] = 0U;
    COUNTER_INC2(n2);
    LOOPDRND128;
    LOOPDRMD128;

    return 0;
}

int
crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
                                      unsigned char *nsec,
                                      const unsigned char *c, unsigned long long clen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    unsigned long long mlen = 0ULL;
    int                ret = -1;

    if (clen >= crypto_aead_aes256gcm_ABYTES) {
        ret = crypto_aead_aes256gcm_decrypt_detached_afternm
            (m, nsec, c, clen - crypto_aead_aes256gcm_ABYTES,
             c + clen - crypto_aead_aes256gcm_ABYTES,
             ad, adlen, npub, ctx_);
    }
    if (mlen_p != NULL) {
        if (ret == 0) {
            mlen = clen - crypto_aead_aes256gcm_ABYTES;
        }
        *mlen_p = mlen;
    }
    return ret;
}

int
crypto_aead_aes256gcm_encrypt_detached(unsigned char *c,
                                       unsigned char *mac,
                                       unsigned long long *maclen_p,
                                       const unsigned char *m,
                                       unsigned long long mlen,
                                       const unsigned char *ad,
                                       unsigned long long adlen,
                                       const unsigned char *nsec,
                                       const unsigned char *npub,
                                       const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    return crypto_aead_aes256gcm_encrypt_detached_afternm
        (c, mac, maclen_p, m, mlen, ad, adlen, nsec, npub,
            (const crypto_aead_aes256gcm_state *) &ctx);
}

int
crypto_aead_aes256gcm_encrypt(unsigned char *c,
                              unsigned long long *clen_p,
                              const unsigned char *m,
                              unsigned long long mlen,
                              const unsigned char *ad,
                              unsigned long long adlen,
                              const unsigned char *nsec,
                              const unsigned char *npub,
                              const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
    int ret;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    ret = crypto_aead_aes256gcm_encrypt_afternm
        (c, clen_p, m, mlen, ad, adlen, nsec, npub,
            (const crypto_aead_aes256gcm_state *) &ctx);
    sodium_memzero(ctx, sizeof ctx);

    return ret;
}

int
crypto_aead_aes256gcm_decrypt_detached(unsigned char *m,
                                       unsigned char *nsec,
                                       const unsigned char *c,
                                       unsigned long long clen,
                                       const unsigned char *mac,
                                       const unsigned char *ad,
                                       unsigned long long adlen,
                                       const unsigned char *npub,
                                       const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    return crypto_aead_aes256gcm_decrypt_detached_afternm
        (m, nsec, c, clen, mac, ad, adlen, npub,
            (const crypto_aead_aes256gcm_state *) &ctx);
}

int
crypto_aead_aes256gcm_decrypt(unsigned char *m,
                              unsigned long long *mlen_p,
                              unsigned char *nsec,
                              const unsigned char *c,
                              unsigned long long clen,
                              const unsigned char *ad,
                              unsigned long long adlen,
                              const unsigned char *npub,
                              const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
    int ret;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    ret = crypto_aead_aes256gcm_decrypt_afternm
        (m, mlen_p, nsec, c, clen, ad, adlen, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
    sodium_memzero(ctx, sizeof ctx);

    return ret;
}

int
crypto_aead_aes256gcm_is_available(void)
{
    return sodium_runtime_has_pclmul() & sodium_runtime_has_aesni();
}
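/* Callers are expected to check this at runtime before using any of the AES-NI
 * code paths above, e.g. (illustrative sketch; use_some_other_aead() is a
 * placeholder for whatever fallback the caller prefers):
 *
 *     if (crypto_aead_aes256gcm_is_available() == 0) {
 *         return use_some_other_aead();
 *     }
 */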

#else

int
crypto_aead_aes256gcm_encrypt_detached(unsigned char *c,
                                       unsigned char *mac,
                                       unsigned long long *maclen_p,
                                       const unsigned char *m,
                                       unsigned long long mlen,
                                       const unsigned char *ad,
                                       unsigned long long adlen,
                                       const unsigned char *nsec,
                                       const unsigned char *npub,
                                       const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_encrypt(unsigned char *c, unsigned long long *clen_p,
                              const unsigned char *m, unsigned long long mlen,
                              const unsigned char *ad, unsigned long long adlen,
                              const unsigned char *nsec, const unsigned char *npub,
                              const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt_detached(unsigned char *m,
                                       unsigned char *nsec,
                                       const unsigned char *c,
                                       unsigned long long clen,
                                       const unsigned char *mac,
                                       const unsigned char *ad,
                                       unsigned long long adlen,
                                       const unsigned char *npub,
                                       const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt(unsigned char *m, unsigned long long *mlen_p,
                              unsigned char *nsec, const unsigned char *c,
                              unsigned long long clen, const unsigned char *ad,
                              unsigned long long adlen, const unsigned char *npub,
                              const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_,
                               const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c,
                                               unsigned char *mac, unsigned long long *maclen_p,
                                               const unsigned char *m, unsigned long long mlen,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *nsec,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen_p,
                                      const unsigned char *m, unsigned long long mlen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *nsec, const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec,
                                               const unsigned char *c, unsigned long long clen,
                                               const unsigned char *mac,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
                                      unsigned char *nsec,
                                      const unsigned char *c, unsigned long long clen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_is_available(void)
{
    return 0;
}

#endif

size_t
crypto_aead_aes256gcm_keybytes(void)
{
    return crypto_aead_aes256gcm_KEYBYTES;
}

size_t
crypto_aead_aes256gcm_nsecbytes(void)
{
    return crypto_aead_aes256gcm_NSECBYTES;
}

size_t
crypto_aead_aes256gcm_npubbytes(void)
{
    return crypto_aead_aes256gcm_NPUBBYTES;
}

size_t
crypto_aead_aes256gcm_abytes(void)
{
    return crypto_aead_aes256gcm_ABYTES;
}

size_t
crypto_aead_aes256gcm_statebytes(void)
{
    return (sizeof(crypto_aead_aes256gcm_state) + (size_t) 15U) & ~(size_t) 15U;
}
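/* Rounded up to a multiple of 16 so that callers reserving
 * crypto_aead_aes256gcm_statebytes() bytes can always place the state on a
 * 16-byte boundary, as the AES-NI code path expects. */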

size_t
crypto_aead_aes256gcm_messagebytes_max(void)
{
    return crypto_aead_aes256gcm_MESSAGEBYTES_MAX;
}

void
crypto_aead_aes256gcm_keygen(unsigned char k[crypto_aead_aes256gcm_KEYBYTES])
{
    randombytes_buf(k, crypto_aead_aes256gcm_KEYBYTES);
}