/*
 * AES256-GCM, based on the "Intel Carry-Less Multiplication Instruction and its Usage for Computing
 * the GCM Mode" paper and reference code, using the aggregated reduction method.
 * Originally adapted by Romain Dolbeau.
 */

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "core.h"
#include "crypto_aead_aes256gcm.h"
#include "export.h"
#include "private/common.h"
#include "private/sse2_64_32.h"
#include "randombytes.h"
#include "runtime.h"
#include "utils.h"

#if defined(HAVE_TMMINTRIN_H) && defined(HAVE_WMMINTRIN_H)

# ifdef __GNUC__
#  pragma GCC target("ssse3")
#  pragma GCC target("aes")
#  pragma GCC target("pclmul")
# endif

# include <tmmintrin.h>
# include <wmmintrin.h>

#ifndef ENOSYS
# define ENOSYS ENXIO
#endif

#if defined(__INTEL_COMPILER) || defined(_bswap64)
#elif defined(_MSC_VER)
# define _bswap64(a) _byteswap_uint64(a)
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
# define _bswap64(a) __builtin_bswap64(a)
#else
static inline uint64_t
_bswap64(const uint64_t x)
{
    return
        ((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) |
        ((x << 24) & 0x0000FF0000000000UL) | ((x <<  8) & 0x000000FF00000000UL) |
        ((x >>  8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) |
        ((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL);
}
#endif

typedef struct context {
    CRYPTO_ALIGN(16) unsigned char H[16];
    __m128i          rkeys[16];
} context;

static inline void
aesni_key256_expand(const unsigned char *key, __m128i * const rkeys)
{
    __m128i X0, X1, X2, X3;
    int     i = 0;

    X0 = _mm_loadu_si128((const __m128i *) &key[0]);
    rkeys[i++] = X0;

    X2 = _mm_loadu_si128((const __m128i *) &key[16]);
    rkeys[i++] = X2;

#define EXPAND_KEY_1(S) do { \
    X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X2, (S)), 0xff); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x10)); \
    X0 = _mm_xor_si128(X0, X3); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X0), 0x8c)); \
    X0 = _mm_xor_si128(_mm_xor_si128(X0, X3), X1); \
    rkeys[i++] = X0; \
} while (0)

#define EXPAND_KEY_2(S) do { \
    X1 = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(X0, (S)), 0xaa); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x10)); \
    X2 = _mm_xor_si128(X2, X3); \
    X3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(X3), _mm_castsi128_ps(X2), 0x8c)); \
    X2 = _mm_xor_si128(_mm_xor_si128(X2, X3), X1); \
    rkeys[i++] = X2; \
} while (0)

    X3 = _mm_setzero_si128();
    EXPAND_KEY_1(0x01); EXPAND_KEY_2(0x01);
    EXPAND_KEY_1(0x02); EXPAND_KEY_2(0x02);
    EXPAND_KEY_1(0x04); EXPAND_KEY_2(0x04);
    EXPAND_KEY_1(0x08); EXPAND_KEY_2(0x08);
    EXPAND_KEY_1(0x10); EXPAND_KEY_2(0x10);
    EXPAND_KEY_1(0x20); EXPAND_KEY_2(0x20);
    EXPAND_KEY_1(0x40);
}

/** single, by-the-book AES encryption with AES-NI */
static inline void
aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
{
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);

    temp = _mm_aesenc_si128(temp, rkeys[1]);
    temp = _mm_aesenc_si128(temp, rkeys[2]);
    temp = _mm_aesenc_si128(temp, rkeys[3]);
    temp = _mm_aesenc_si128(temp, rkeys[4]);
    temp = _mm_aesenc_si128(temp, rkeys[5]);
    temp = _mm_aesenc_si128(temp, rkeys[6]);
    temp = _mm_aesenc_si128(temp, rkeys[7]);
    temp = _mm_aesenc_si128(temp, rkeys[8]);
    temp = _mm_aesenc_si128(temp, rkeys[9]);
    temp = _mm_aesenc_si128(temp, rkeys[10]);
    temp = _mm_aesenc_si128(temp, rkeys[11]);
    temp = _mm_aesenc_si128(temp, rkeys[12]);
    temp = _mm_aesenc_si128(temp, rkeys[13]);

    temp = _mm_aesenclast_si128(temp, rkeys[14]);
    _mm_storeu_si128((__m128i *) out, temp);
}
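/*
 * A note on the schedule above: AES-256 uses 14 rounds, hence the 15
 * round keys rkeys[0] through rkeys[14]; the last slot of the 16-entry
 * rkeys array is simply unused. In EXPAND_KEY_1/EXPAND_KEY_2, the pair
 * of _mm_shuffle_ps() calls appears to be a branch-free way of computing
 * the running xor of the four 32-bit words of the previous round key, as
 * the FIPS-197 key schedule requires, without leaving the SSE registers.
 */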
/** multiple-blocks-at-once AES encryption with AES-NI;
    on Haswell, aesenc has a latency of 7 cycles and a throughput of 1
    per cycle, so the sequence of aesenc instructions should be
    bubble-free if at least 8 blocks are processed in parallel.
    Let's build an arbitrary-sized function */
/* Step 1: loading the nonce */
/* load & increment the n vector (non-vectorized, unused for now) */
#define NVDECLx(a) \
    __m128i nv##a

#define NVx(a) \
    nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \
    n[3]++

/* Step 2: define the value in round one (xor with subkey #0, aka the key) */
#define TEMPDECLx(a) \
    __m128i temp##a

#define TEMPx(a) \
    temp##a = _mm_xor_si128(nv##a, rkeys[0])

/* Step 3: one round of AES */
#define AESENCx(a) \
    temp##a = _mm_aesenc_si128(temp##a, rkeys[roundctr])

/* Step 4: last round of AES */
#define AESENCLASTx(a) \
    temp##a = _mm_aesenclast_si128(temp##a, rkeys[14])

/* Step 5: store the result */
#define STOREx(a) \
    _mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a)

/* all the MAKE* macros are for automatic explicit unrolling */
#define MAKE4(X) \
    X(0);        \
    X(1);        \
    X(2);        \
    X(3)

#define MAKE8(X) \
    X(0);        \
    X(1);        \
    X(2);        \
    X(3);        \
    X(4);        \
    X(5);        \
    X(6);        \
    X(7)

#define COUNTER_INC2(N) (N)[3] += 2

/* create a function unrolled N times; MAKEN is the matching unrolling
   macro defined above, so the N in MAKEN must match N. */
#define FUNC(N, MAKEN) \
    static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys) \
    { \
        const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
        int roundctr; \
        MAKEN(NVDECLx); \
        MAKEN(TEMPDECLx); \
\
        MAKEN(NVx); \
        MAKEN(TEMPx); \
        for (roundctr = 1; roundctr < 14; roundctr++) { \
            MAKEN(AESENCx); \
        } \
        MAKEN(AESENCLASTx); \
        MAKEN(STOREx); \
    }

FUNC(8, MAKE8)
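/*
 * For reference, FUNC(8, MAKE8) defines aesni_encrypt8(), roughly
 * equivalent to the following sketch (illustration only, not real code):
 *
 *   for (b = 0; b < 8; b++) {
 *       nv[b] = byteswap_counter_word(n); n[3]++;   // NVx
 *       t[b]  = nv[b] ^ rkeys[0];                   // TEMPx
 *   }
 *   for (round = 1; round < 14; round++)
 *       for (b = 0; b < 8; b++)
 *           t[b] = aesenc(t[b], rkeys[round]);      // AESENCx, interleaved
 *   for (b = 0; b < 8; b++)
 *       out[b] = aesenclast(t[b], rkeys[14]);       // AESENCLASTx + STOREx
 *
 * Interleaving the eight independent streams in the inner loop is what
 * hides the 7-cycle aesenc latency mentioned above.
 */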
/* all GF(2^128) functions are by the book, meaning this one:
   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
*/

static inline void
addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
{
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i       A, B, C;
    __m128i       tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
    __m128i       tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp18;
    __m128i       tmp19, tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
    __m128i       tmp28, tmp29, tmp30, tmp31, tmp32, tmp33, tmp34, tmp35, tmp36;

    if (xlen >= 16) {
        A = _mm_loadu_si128((const __m128i *) a);
    } else {
        CRYPTO_ALIGN(16) unsigned char padded[16];
        unsigned int i;

        memset(padded, 0, 16);
        for (i = 0; i < xlen; i++) {
            padded[i] = a[i];
        }
        A = _mm_load_si128((const __m128i *) padded);
    }
    A = _mm_shuffle_epi8(A, rev);
    B = _mm_loadu_si128((const __m128i *) b);
    C = _mm_loadu_si128((const __m128i *) c);
    A = _mm_xor_si128(A, C);
    tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
    tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
    tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
    tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
    tmp10 = _mm_xor_si128(tmp4, tmp5);
    tmp13 = _mm_slli_si128(tmp10, 8);
    tmp11 = _mm_srli_si128(tmp10, 8);
    tmp15 = _mm_xor_si128(tmp3, tmp13);
    tmp17 = _mm_xor_si128(tmp6, tmp11);
    tmp7 = _mm_srli_epi32(tmp15, 31);
    tmp8 = _mm_srli_epi32(tmp17, 31);
    tmp16 = _mm_slli_epi32(tmp15, 1);
    tmp18 = _mm_slli_epi32(tmp17, 1);
    tmp9 = _mm_srli_si128(tmp7, 12);
    tmp22 = _mm_slli_si128(tmp8, 4);
    tmp25 = _mm_slli_si128(tmp7, 4);
    tmp29 = _mm_or_si128(tmp16, tmp25);
    tmp19 = _mm_or_si128(tmp18, tmp22);
    tmp20 = _mm_or_si128(tmp19, tmp9);
    tmp26 = _mm_slli_epi32(tmp29, 31);
    tmp23 = _mm_slli_epi32(tmp29, 30);
    tmp32 = _mm_slli_epi32(tmp29, 25);
    tmp27 = _mm_xor_si128(tmp26, tmp23);
    tmp28 = _mm_xor_si128(tmp27, tmp32);
    tmp24 = _mm_srli_si128(tmp28, 4);
    tmp33 = _mm_slli_si128(tmp28, 12);
    tmp30 = _mm_xor_si128(tmp29, tmp33);
    tmp2 = _mm_srli_epi32(tmp30, 1);
    tmp12 = _mm_srli_epi32(tmp30, 2);
    tmp14 = _mm_srli_epi32(tmp30, 7);
    tmp34 = _mm_xor_si128(tmp2, tmp12);
    tmp35 = _mm_xor_si128(tmp34, tmp14);
    tmp36 = _mm_xor_si128(tmp35, tmp24);
    tmp31 = _mm_xor_si128(tmp30, tmp36);
    tmp21 = _mm_xor_si128(tmp20, tmp31);
    _mm_storeu_si128((__m128i *) c, tmp21);
}
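/*
 * A note on the constants above: GCM works in GF(2^128) modulo
 * x^128 + x^7 + x^2 + x + 1, with operands kept in bit-reflected order
 * (hence the rev shuffle). The shift-left-by-one-bit pass (the pairs of
 * srli_epi32 by 31 / slli_epi32 by 1, stitched together with the
 * slli_si128/srli_si128 byte shifts) realigns the 255-bit carry-less
 * product, and the shifts by 31/30/25 followed by 1/2/7 fold the high
 * half back in, per the Intel white paper cited above.
 */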
/* pure multiplication, for pre-computing powers of H */
static inline __m128i
mulv(__m128i A, __m128i B)
{
    __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
    __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
    __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
    __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
    __m128i tmp10 = _mm_xor_si128(tmp4, tmp5);
    __m128i tmp13 = _mm_slli_si128(tmp10, 8);
    __m128i tmp11 = _mm_srli_si128(tmp10, 8);
    __m128i tmp15 = _mm_xor_si128(tmp3, tmp13);
    __m128i tmp17 = _mm_xor_si128(tmp6, tmp11);
    __m128i tmp7 = _mm_srli_epi32(tmp15, 31);
    __m128i tmp8 = _mm_srli_epi32(tmp17, 31);
    __m128i tmp16 = _mm_slli_epi32(tmp15, 1);
    __m128i tmp18 = _mm_slli_epi32(tmp17, 1);
    __m128i tmp9 = _mm_srli_si128(tmp7, 12);
    __m128i tmp22 = _mm_slli_si128(tmp8, 4);
    __m128i tmp25 = _mm_slli_si128(tmp7, 4);
    __m128i tmp29 = _mm_or_si128(tmp16, tmp25);
    __m128i tmp19 = _mm_or_si128(tmp18, tmp22);
    __m128i tmp20 = _mm_or_si128(tmp19, tmp9);
    __m128i tmp26 = _mm_slli_epi32(tmp29, 31);
    __m128i tmp23 = _mm_slli_epi32(tmp29, 30);
    __m128i tmp32 = _mm_slli_epi32(tmp29, 25);
    __m128i tmp27 = _mm_xor_si128(tmp26, tmp23);
    __m128i tmp28 = _mm_xor_si128(tmp27, tmp32);
    __m128i tmp24 = _mm_srli_si128(tmp28, 4);
    __m128i tmp33 = _mm_slli_si128(tmp28, 12);
    __m128i tmp30 = _mm_xor_si128(tmp29, tmp33);
    __m128i tmp2 = _mm_srli_epi32(tmp30, 1);
    __m128i tmp12 = _mm_srli_epi32(tmp30, 2);
    __m128i tmp14 = _mm_srli_epi32(tmp30, 7);
    __m128i tmp34 = _mm_xor_si128(tmp2, tmp12);
    __m128i tmp35 = _mm_xor_si128(tmp34, tmp14);
    __m128i tmp36 = _mm_xor_si128(tmp35, tmp24);
    __m128i tmp31 = _mm_xor_si128(tmp30, tmp36);
    __m128i C = _mm_xor_si128(tmp20, tmp31);

    return C;
}
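/*
 * mulv() only runs at setup time: the encryption and decryption paths
 * precompute H^2 = mulv(H, H), H^3 = mulv(H^2, H) and H^4 = mulv(H^3, H),
 * which lets MULREDUCE4 below fold four GHASH blocks with four carry-less
 * multiplications but a single reduction (the aggregated reduction method).
 */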
/* 4 multiply-accumulate at once; again
   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
   for the Aggregated Reduction Method & sample code.
   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */

#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
#define RED_MUL_MID(a) \
    tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \
    tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
    tmp##a = _mm_xor_si128(tmp##a, H##a); \
    tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
    tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)

#define MULREDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, accv) \
do { \
    MAKE4(RED_DECL); \
    __m128i lo, hi; \
    __m128i tmp8, tmp9; \
    __m128i H0 = H0_; \
    __m128i H1 = H1_; \
    __m128i H2 = H2_; \
    __m128i H3 = H3_; \
    __m128i X0 = X0_; \
    __m128i X1 = X1_; \
    __m128i X2 = X2_; \
    __m128i X3 = X3_; \
\
    /* byte-reverse the inputs & xor the first block into the accumulator */ \
\
    MAKE4(RED_SHUFFLE); \
    X3 = _mm_xor_si128(X3, accv); \
\
    /* 4 low H*X (x0*h0) */ \
\
    MAKE4(RED_MUL_LOW); \
    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
    lo = _mm_xor_si128(lo, H2_X2_lo); \
    lo = _mm_xor_si128(lo, H3_X3_lo); \
\
    /* 4 high H*X (x1*h1) */ \
\
    MAKE4(RED_MUL_HIGH); \
    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
    hi = _mm_xor_si128(hi, H2_X2_hi); \
    hi = _mm_xor_si128(hi, H3_X3_hi); \
\
    /* 4 middle H*X, using Karatsuba, i.e. \
       x1*h0 + x0*h1 = (x1 + x0) * (h1 + h0) - x1*h1 - x0*h0; \
       we already have all the x1*h1 & x0*h0 terms (accumulated in hi & lo) \
       (0 is the low half and 1 is the high half) \
    */ \
    /* permute the high and low 64 bits in H1 & X1, \
       so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
       then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
       and finally multiply \
    */ \
    MAKE4(RED_MUL_MID); \
\
    /* subtracts x1*h1 and x0*h0 (in GF(2), subtraction is xor) */ \
    tmp0 = _mm_xor_si128(tmp0, lo); \
    tmp0 = _mm_xor_si128(tmp0, hi); \
    tmp0 = _mm_xor_si128(tmp1, tmp0); \
    tmp0 = _mm_xor_si128(tmp2, tmp0); \
    tmp0 = _mm_xor_si128(tmp3, tmp0); \
\
    /* reduction */ \
    tmp0B = _mm_slli_si128(tmp0, 8); \
    tmp0 = _mm_srli_si128(tmp0, 8); \
    lo = _mm_xor_si128(tmp0B, lo); \
    hi = _mm_xor_si128(tmp0, hi); \
    tmp3 = lo; \
    tmp2B = hi; \
    tmp3B = _mm_srli_epi32(tmp3, 31); \
    tmp8 = _mm_srli_epi32(tmp2B, 31); \
    tmp3 = _mm_slli_epi32(tmp3, 1); \
    tmp2B = _mm_slli_epi32(tmp2B, 1); \
    tmp9 = _mm_srli_si128(tmp3B, 12); \
    tmp8 = _mm_slli_si128(tmp8, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 4); \
    tmp3 = _mm_or_si128(tmp3, tmp3B); \
    tmp2B = _mm_or_si128(tmp2B, tmp8); \
    tmp2B = _mm_or_si128(tmp2B, tmp9); \
    tmp3B = _mm_slli_epi32(tmp3, 31); \
    tmp8 = _mm_slli_epi32(tmp3, 30); \
    tmp9 = _mm_slli_epi32(tmp3, 25); \
    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
    tmp8 = _mm_srli_si128(tmp3B, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 12); \
    tmp3 = _mm_xor_si128(tmp3, tmp3B); \
    tmp2 = _mm_srli_epi32(tmp3, 1); \
    tmp0B = _mm_srli_epi32(tmp3, 2); \
    tmp1B = _mm_srli_epi32(tmp3, 7); \
    tmp2 = _mm_xor_si128(tmp2, tmp0B); \
    tmp2 = _mm_xor_si128(tmp2, tmp1B); \
    tmp2 = _mm_xor_si128(tmp2, tmp8); \
    tmp3 = _mm_xor_si128(tmp3, tmp2); \
    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
\
    accv = tmp2B; \
} while (0)
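/*
 * In GHASH terms, with blocks B0..B3 passed as X3_..X0_ (note the
 * reversed argument order at the call sites) and H0_..H3_ = H, H^2,
 * H^3, H^4, MULREDUCE4 performs
 *
 *   acc <- (B0 ^ acc)*H^4 ^ B1*H^3 ^ B2*H^2 ^ B3*H
 *
 * which equals four sequential acc <- (acc ^ Bi)*H updates, but with a
 * single reduction.
 */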
#define XORx(a) \
    temp##a = _mm_xor_si128(temp##a, \
                            _mm_loadu_si128((const __m128i *) (in + a * 16)))

#define LOADx(a) \
    __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16))

/* full encrypt & checksum 8 blocks at once */
#define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
do { \
    unsigned char       *out = out_; \
    uint32_t            *n = n_; \
    const unsigned char *in = in_; \
    const __m128i        hv = hv_; \
    const __m128i        h2v = h2v_; \
    const __m128i        h3v = h3v_; \
    const __m128i        h4v = h4v_; \
    const __m128i        pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
    __m128i              accv_; \
    int                  roundctr; \
\
    MAKE8(NVDECLx); \
    MAKE8(TEMPDECLx); \
    MAKE8(NVx); \
    MAKE8(TEMPx); \
    for (roundctr = 1; roundctr < 14; roundctr++) { \
        MAKE8(AESENCx); \
    } \
    MAKE8(AESENCLASTx); \
    MAKE8(XORx); \
    MAKE8(STOREx); \
    accv_ = _mm_load_si128((const __m128i *) accum); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv_); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv_); \
    _mm_store_si128((__m128i *) accum, accv_); \
} while (0)

/* checksum 8 blocks at once */
#define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
do { \
    const unsigned char *in = in_; \
    const __m128i        hv = hv_; \
    const __m128i        h2v = h2v_; \
    const __m128i        h3v = h3v_; \
    const __m128i        h4v = h4v_; \
    __m128i              accv_; \
\
    MAKE8(LOADx); \
    accv_ = _mm_load_si128((const __m128i *) accum); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv_); \
    MULREDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv_); \
    _mm_store_si128((__m128i *) accum, accv_); \
} while (0)

/* decrypt 8 blocks at once */
#define aesni_decrypt8full(out_, n_, rkeys, in_) \
do { \
    unsigned char       *out = out_; \
    uint32_t            *n = n_; \
    const unsigned char *in = in_; \
    const __m128i        pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
    int                  roundctr; \
\
    MAKE8(NVDECLx); \
    MAKE8(TEMPDECLx); \
    MAKE8(NVx); \
    MAKE8(TEMPx); \
    for (roundctr = 1; roundctr < 14; roundctr++) { \
        MAKE8(AESENCx); \
    } \
    MAKE8(AESENCLASTx); \
    MAKE8(XORx); \
    MAKE8(STOREx); \
} while (0)

int
crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_,
                               const unsigned char *k)
{
    context       *ctx   = (context *) ctx_;
    __m128i       *rkeys = ctx->rkeys;
    __m128i        zero  = _mm_setzero_si128();
    unsigned char *H     = ctx->H;

    COMPILER_ASSERT((sizeof *ctx_) >= (sizeof *ctx));
    aesni_key256_expand(k, rkeys);
    aesni_encrypt1(H, zero, rkeys);

    return 0;
}
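/*
 * Per the GCM specification (NIST SP 800-38D), the hash subkey is
 * H = AES-256_k(0^128); beforenm() therefore expands the key and encrypts
 * one all-zero block into ctx->H once, so that the *_afternm() functions
 * can reuse both across messages encrypted under the same key.
 */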
int
crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c,
                                               unsigned char *mac, unsigned long long *maclen_p,
                                               const unsigned char *m, unsigned long long mlen,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *nsec,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    const __m128i       rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    const context      *ctx = (const context *) ctx_;
    const __m128i      *rkeys = ctx->rkeys;
    __m128i             Hv, H2v, H3v, H4v, accv;
    unsigned long long  i, j;
    unsigned long long  adlen_rnd64 = adlen & ~63ULL;
    unsigned long long  mlen_rnd128 = mlen & ~127ULL;
    CRYPTO_ALIGN(16) uint32_t      n2[4];
    CRYPTO_ALIGN(16) unsigned char H[16];
    CRYPTO_ALIGN(16) unsigned char T[16];
    CRYPTO_ALIGN(16) unsigned char accum[16];
    CRYPTO_ALIGN(16) unsigned char fb[16];

    (void) nsec;
    memcpy(H, ctx->H, sizeof H);
    if (mlen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) {
        sodium_misuse(); /* LCOV_EXCL_LINE */
    }
    /* 96-bit IV: the initial counter block is J0 = IV || 0^31 || 1 (big-endian) */
    memcpy(&n2[0], npub, 3 * 4);
    n2[3] = 0x01000000;
    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
    /* final GHASH block: 64-bit big-endian bit lengths of the AAD and message */
    {
        uint64_t x;
        x = _bswap64((uint64_t) (8 * adlen));
        memcpy(&fb[0], &x, sizeof x);
        x = _bswap64((uint64_t) (8 * mlen));
        memcpy(&fb[8], &x, sizeof x);
    }
    /* we store H (and its powers) byte-reversed once and for all */
    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
    _mm_store_si128((__m128i *) H, Hv);
    H2v = mulv(Hv, Hv);
    H3v = mulv(H2v, Hv);
    H4v = mulv(H3v, Hv);

    accv = _mm_setzero_si128();
    /* GCM unrolled by 4 (unrolling by 8 doesn't improve on MULREDUCE4) */
    for (i = 0; i < adlen_rnd64; i += 64) {
        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
    }
    _mm_store_si128((__m128i *) accum, accv);

    /* GCM remainder loop */
    for (i = adlen_rnd64; i < adlen; i += 16) {
        unsigned int blocklen = 16;

        if (i + (unsigned long long) blocklen > adlen) {
            blocklen = (unsigned int) (adlen - i);
        }
        addmul(accum, ad + i, blocklen, H);
    }

/* this only does 8 full blocks, so no fancy bounds checking is necessary */
#define LOOPRND128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = 0; i < mlen_rnd128; i += lb) { \
            aesni_encrypt8full(c + i, n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v, rev); \
        } \
    } while (0)

/* remainder loop, with the slower GCM update to accommodate partial blocks */
#define LOOPRMD128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = mlen_rnd128; i < mlen; i += lb) { \
            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
            unsigned long long mj = lb; \
\
            aesni_encrypt8(outni, n2, rkeys); \
            if ((i + mj) >= mlen) { \
                mj = mlen - i; \
            } \
            for (j = 0; j < mj; j++) { \
                c[i + j] = m[i + j] ^ outni[j]; \
            } \
            for (j = 0; j < mj; j += 16) { \
                unsigned int bl = 16; \
\
                if (j + (unsigned long long) bl >= mj) { \
                    bl = (unsigned int) (mj - j); \
                } \
                addmul(accum, c + i + j, bl, H); \
            } \
        } \
    } while (0)

    /* counter 1 (J0) was consumed computing T; message blocks start at 2 */
    n2[3] &= 0x00ffffff;
    COUNTER_INC2(n2);
    LOOPRND128;
    LOOPRMD128;

    addmul(accum, fb, 16, H);

    /* tag = E_k(J0) ^ GHASH; accum is kept byte-reflected, hence [15 - i] */
    for (i = 0; i < 16; ++i) {
        mac[i] = T[i] ^ accum[15 - i];
    }
    if (maclen_p != NULL) {
        *maclen_p = 16;
    }
    return 0;
}
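/*
 * Flow of the function above: T = E_k(J0) is computed first, the AAD is
 * absorbed into the GHASH accumulator, the message is encrypted in CTR
 * mode eight blocks at a time while the ciphertext is absorbed on the
 * fly (aesni_encrypt8full), the length block fb is absorbed last, and
 * the tag is T ^ GHASH.
 */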
int
crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen_p,
                                      const unsigned char *m, unsigned long long mlen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *nsec,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    int ret = crypto_aead_aes256gcm_encrypt_detached_afternm(c,
                                                             c + mlen, NULL,
                                                             m, mlen,
                                                             ad, adlen,
                                                             nsec, npub, ctx_);
    if (clen_p != NULL) {
        *clen_p = mlen + crypto_aead_aes256gcm_ABYTES;
    }
    return ret;
}

int
crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec,
                                               const unsigned char *c, unsigned long long clen,
                                               const unsigned char *mac,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    const __m128i       rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    const context      *ctx = (const context *) ctx_;
    const __m128i      *rkeys = ctx->rkeys;
    __m128i             Hv, H2v, H3v, H4v, accv;
    unsigned long long  i, j;
    unsigned long long  adlen_rnd64 = adlen & ~63ULL;
    unsigned long long  mlen;
    unsigned long long  mlen_rnd128;
    CRYPTO_ALIGN(16) uint32_t      n2[4];
    CRYPTO_ALIGN(16) unsigned char H[16];
    CRYPTO_ALIGN(16) unsigned char T[16];
    CRYPTO_ALIGN(16) unsigned char accum[16];
    CRYPTO_ALIGN(16) unsigned char fb[16];

    (void) nsec;
    if (clen > crypto_aead_aes256gcm_MESSAGEBYTES_MAX) {
        sodium_misuse(); /* LCOV_EXCL_LINE */
    }
    mlen = clen;

    memcpy(&n2[0], npub, 3 * 4);
    n2[3] = 0x01000000;
    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);

    {
        uint64_t x;
        x = _bswap64((uint64_t) (8 * adlen));
        memcpy(&fb[0], &x, sizeof x);
        x = _bswap64((uint64_t) (8 * mlen));
        memcpy(&fb[8], &x, sizeof x);
    }

    memcpy(H, ctx->H, sizeof H);
    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
    _mm_store_si128((__m128i *) H, Hv);
    H2v = mulv(Hv, Hv);
    H3v = mulv(H2v, Hv);
    H4v = mulv(H3v, Hv);

    accv = _mm_setzero_si128();
    for (i = 0; i < adlen_rnd64; i += 64) {
        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
        MULREDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
    }
    _mm_store_si128((__m128i *) accum, accv);

    for (i = adlen_rnd64; i < adlen; i += 16) {
        unsigned int blocklen = 16;

        if (i + (unsigned long long) blocklen > adlen) {
            blocklen = (unsigned int) (adlen - i);
        }
        addmul(accum, ad + i, blocklen, H);
    }

    mlen_rnd128 = mlen & ~127ULL;

#define LOOPACCUMDRND128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = 0; i < mlen_rnd128; i += lb) { \
            aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v, rev); \
        } \
    } while (0)

#define LOOPDRND128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = 0; i < mlen_rnd128; i += lb) { \
            aesni_decrypt8full(m + i, n2, rkeys, c + i); \
        } \
    } while (0)

#define LOOPACCUMDRMD128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = mlen_rnd128; i < mlen; i += lb) { \
            unsigned long long mj = lb; \
\
            if ((i + mj) >= mlen) { \
                mj = mlen - i; \
            } \
            for (j = 0; j < mj; j += 16) { \
                unsigned int bl = 16; \
\
                if (j + (unsigned long long) bl >= mj) { \
                    bl = (unsigned int) (mj - j); \
                } \
                addmul(accum, c + i + j, bl, H); \
            } \
        } \
    } while (0)

#define LOOPDRMD128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
\
        for (i = mlen_rnd128; i < mlen; i += lb) { \
            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
            unsigned long long mj = lb; \
\
            if ((i + mj) >= mlen) { \
                mj = mlen - i; \
            } \
            aesni_encrypt8(outni, n2, rkeys); \
            for (j = 0; j < mj; j++) { \
                m[i + j] = c[i + j] ^ outni[j]; \
            } \
        } \
    } while (0)

    n2[3] &= 0x00ffffff;

    COUNTER_INC2(n2);
    LOOPACCUMDRND128;
    LOOPACCUMDRMD128;
    addmul(accum, fb, 16, H);
    {
        unsigned char d = 0;

        for (i = 0; i < 16; i++) {
            d |= (mac[i] ^ (T[i] ^ accum[15 - i]));
        }
        if (d != 0) {
            if (m != NULL) {
                memset(m, 0, mlen);
            }
            return -1;
        }
        if (m == NULL) {
            return 0;
        }
    }
    n2[3] = 0U;
    COUNTER_INC2(n2);
    LOOPDRND128;
    LOOPDRMD128;

    return 0;
}
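/*
 * Note the ordering in the decryption path above: the whole ciphertext
 * is authenticated first (LOOPACCUMDRND128/LOOPACCUMDRMD128), the tag is
 * compared in constant time by or-ing the byte differences into d, and
 * only after the tag verifies is anything decrypted; on a mismatch, m is
 * wiped and -1 is returned, so unverified plaintext is never exposed.
 */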
int
crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
                                      unsigned char *nsec,
                                      const unsigned char *c, unsigned long long clen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    unsigned long long mlen = 0ULL;
    int                ret  = -1;

    if (clen >= crypto_aead_aes256gcm_ABYTES) {
        ret = crypto_aead_aes256gcm_decrypt_detached_afternm
            (m, nsec, c, clen - crypto_aead_aes256gcm_ABYTES,
             c + clen - crypto_aead_aes256gcm_ABYTES,
             ad, adlen, npub, ctx_);
    }
    if (mlen_p != NULL) {
        if (ret == 0) {
            mlen = clen - crypto_aead_aes256gcm_ABYTES;
        }
        *mlen_p = mlen;
    }
    return ret;
}

int
crypto_aead_aes256gcm_encrypt_detached(unsigned char *c,
                                       unsigned char *mac,
                                       unsigned long long *maclen_p,
                                       const unsigned char *m,
                                       unsigned long long mlen,
                                       const unsigned char *ad,
                                       unsigned long long adlen,
                                       const unsigned char *nsec,
                                       const unsigned char *npub,
                                       const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
    int ret;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    ret = crypto_aead_aes256gcm_encrypt_detached_afternm
        (c, mac, maclen_p, m, mlen, ad, adlen, nsec, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
    sodium_memzero(ctx, sizeof ctx); /* don't leave the expanded key on the stack */

    return ret;
}

int
crypto_aead_aes256gcm_encrypt(unsigned char *c,
                              unsigned long long *clen_p,
                              const unsigned char *m,
                              unsigned long long mlen,
                              const unsigned char *ad,
                              unsigned long long adlen,
                              const unsigned char *nsec,
                              const unsigned char *npub,
                              const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
    int ret;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    ret = crypto_aead_aes256gcm_encrypt_afternm
        (c, clen_p, m, mlen, ad, adlen, nsec, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
    sodium_memzero(ctx, sizeof ctx);

    return ret;
}

int
crypto_aead_aes256gcm_decrypt_detached(unsigned char *m,
                                       unsigned char *nsec,
                                       const unsigned char *c,
                                       unsigned long long clen,
                                       const unsigned char *mac,
                                       const unsigned char *ad,
                                       unsigned long long adlen,
                                       const unsigned char *npub,
                                       const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
    int ret;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    ret = crypto_aead_aes256gcm_decrypt_detached_afternm
        (m, nsec, c, clen, mac, ad, adlen, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
    sodium_memzero(ctx, sizeof ctx); /* don't leave the expanded key on the stack */

    return ret;
}

int
crypto_aead_aes256gcm_decrypt(unsigned char *m,
                              unsigned long long *mlen_p,
                              unsigned char *nsec,
                              const unsigned char *c,
                              unsigned long long clen,
                              const unsigned char *ad,
                              unsigned long long adlen,
                              const unsigned char *npub,
                              const unsigned char *k)
{
    CRYPTO_ALIGN(16) crypto_aead_aes256gcm_state ctx;
    int ret;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    ret = crypto_aead_aes256gcm_decrypt_afternm
        (m, mlen_p, nsec, c, clen, ad, adlen, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
    sodium_memzero(ctx, sizeof ctx);

    return ret;
}

int
crypto_aead_aes256gcm_is_available(void)
{
    return sodium_runtime_has_pclmul() & sodium_runtime_has_aesni();
}
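/*
 * Callers must gate use of this implementation on
 * crypto_aead_aes256gcm_is_available(): both the AES-NI and CLMUL CPU
 * flags, detected at runtime via sodium_runtime_has_aesni() and
 * sodium_runtime_has_pclmul(), are required, otherwise the aesenc and
 * pclmulqdq instructions used above would fault on older x86 CPUs.
 */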
#else

int
crypto_aead_aes256gcm_encrypt_detached(unsigned char *c,
                                       unsigned char *mac,
                                       unsigned long long *maclen_p,
                                       const unsigned char *m,
                                       unsigned long long mlen,
                                       const unsigned char *ad,
                                       unsigned long long adlen,
                                       const unsigned char *nsec,
                                       const unsigned char *npub,
                                       const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_encrypt(unsigned char *c, unsigned long long *clen_p,
                              const unsigned char *m, unsigned long long mlen,
                              const unsigned char *ad, unsigned long long adlen,
                              const unsigned char *nsec, const unsigned char *npub,
                              const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt_detached(unsigned char *m,
                                       unsigned char *nsec,
                                       const unsigned char *c,
                                       unsigned long long clen,
                                       const unsigned char *mac,
                                       const unsigned char *ad,
                                       unsigned long long adlen,
                                       const unsigned char *npub,
                                       const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt(unsigned char *m, unsigned long long *mlen_p,
                              unsigned char *nsec, const unsigned char *c,
                              unsigned long long clen, const unsigned char *ad,
                              unsigned long long adlen, const unsigned char *npub,
                              const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_,
                               const unsigned char *k)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_encrypt_detached_afternm(unsigned char *c,
                                               unsigned char *mac, unsigned long long *maclen_p,
                                               const unsigned char *m, unsigned long long mlen,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *nsec,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen_p,
                                      const unsigned char *m, unsigned long long mlen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *nsec, const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt_detached_afternm(unsigned char *m, unsigned char *nsec,
                                               const unsigned char *c, unsigned long long clen,
                                               const unsigned char *mac,
                                               const unsigned char *ad, unsigned long long adlen,
                                               const unsigned char *npub,
                                               const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
                                      unsigned char *nsec,
                                      const unsigned char *c, unsigned long long clen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    errno = ENOSYS;
    return -1;
}

int
crypto_aead_aes256gcm_is_available(void)
{
    return 0;
}

#endif

size_t
crypto_aead_aes256gcm_keybytes(void)
{
    return crypto_aead_aes256gcm_KEYBYTES;
}

size_t
crypto_aead_aes256gcm_nsecbytes(void)
{
    return crypto_aead_aes256gcm_NSECBYTES;
}

size_t
crypto_aead_aes256gcm_npubbytes(void)
{
    return crypto_aead_aes256gcm_NPUBBYTES;
}

size_t
crypto_aead_aes256gcm_abytes(void)
{
    return crypto_aead_aes256gcm_ABYTES;
}

size_t
crypto_aead_aes256gcm_statebytes(void)
{
    return (sizeof(crypto_aead_aes256gcm_state) + (size_t) 15U) & ~(size_t) 15U;
}

size_t
crypto_aead_aes256gcm_messagebytes_max(void)
{
    return crypto_aead_aes256gcm_MESSAGEBYTES_MAX;
}

void
crypto_aead_aes256gcm_keygen(unsigned char k[crypto_aead_aes256gcm_KEYBYTES])
{
    randombytes_buf(k, crypto_aead_aes256gcm_KEYBYTES);
}
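/*
 * Example usage of the public API defined in this file (an illustrative
 * sketch; MESSAGE, MESSAGE_LEN, AD and AD_LEN are hypothetical, and error
 * handling is omitted):
 *
 *   unsigned char      key[crypto_aead_aes256gcm_KEYBYTES];
 *   unsigned char      nonce[crypto_aead_aes256gcm_NPUBBYTES];
 *   unsigned char      ct[MESSAGE_LEN + crypto_aead_aes256gcm_ABYTES];
 *   unsigned long long ct_len;
 *
 *   if (crypto_aead_aes256gcm_is_available()) {
 *       crypto_aead_aes256gcm_keygen(key);
 *       randombytes_buf(nonce, sizeof nonce);
 *       crypto_aead_aes256gcm_encrypt(ct, &ct_len, MESSAGE, MESSAGE_LEN,
 *                                     AD, AD_LEN, NULL, nonce, key);
 *   }
 */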