#include <stdint.h>
#include <string.h>

#include "../onetimeauth_poly1305.h"
#include "crypto_verify_16.h"
#include "poly1305_sse2.h"
#include "private/common.h"
#include "private/sse2_64_32.h"
#include "utils.h"

#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)

# ifdef __GNUC__
#  pragma GCC target("sse2")
# endif

# include <emmintrin.h>

typedef __m128i xmmi;

# if defined(_MSC_VER)
#  define POLY1305_NOINLINE __declspec(noinline)
# elif defined(__clang__) || defined(__GNUC__)
#  define POLY1305_NOINLINE __attribute__((noinline))
# else
#  define POLY1305_NOINLINE
# endif

# define poly1305_block_size 32

enum poly1305_state_flags_t {
    poly1305_started       = 1,
    poly1305_final_shift8  = 4,  /* final 32-byte chunk: only the first
                                    16-byte block is full */
    poly1305_final_shift16 = 8,  /* final 32-byte chunk: no full block */
    poly1305_final_r2_r    = 16, /* use [r^2,r] for the final block */
    poly1305_final_r_1     = 32  /* use [r,1] for the final block */
};

typedef struct poly1305_state_internal_t {
    union {
        uint64_t h[3];
        uint32_t hh[10];
    } H;                                            /* 40 bytes */
    uint32_t           R[5];                        /* 20 bytes */
    uint32_t           R2[5];                       /* 20 bytes */
    uint32_t           R4[5];                       /* 20 bytes */
    uint64_t           pad[2];                      /* 16 bytes */
    uint64_t           flags;                       /* 8 bytes */
    unsigned long long leftover;                    /* 8 bytes */
    unsigned char      buffer[poly1305_block_size]; /* 32 bytes */
} poly1305_state_internal_t;                        /* 164 bytes total */

/*
 * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are
 * totally fine, even though this intrinsic requires a __m128i* input.
 * This confuses dynamic analysis, so force alignment, only in debug mode.
 */
# ifdef DEBUG
static xmmi
_fakealign_mm_loadl_epi64(const void *m)
{
    xmmi tmp;

    memcpy(&tmp, m, 8);

    return _mm_loadl_epi64(&tmp);
}
#  define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
# endif

/* copy 0-31 bytes */
static inline void
poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
                      unsigned long long bytes)
{
    if (bytes & 16) {
        _mm_store_si128((xmmi *) (void *) dst,
                        _mm_loadu_si128((const xmmi *) (const void *) src));
        src += 16;
        dst += 16;
    }
    if (bytes & 8) {
        memcpy(dst, src, 8);
        src += 8;
        dst += 8;
    }
    if (bytes & 4) {
        memcpy(dst, src, 4);
        src += 4;
        dst += 4;
    }
    if (bytes & 2) {
        memcpy(dst, src, 2);
        src += 2;
        dst += 2;
    }
    if (bytes & 1) {
        *dst = *src;
    }
}

static POLY1305_NOINLINE void
poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
                  unsigned long long bytes)
{
    uint32_t          *R;
    uint128_t          d[3];
    uint64_t           r0, r1, r2;
    uint64_t           rt0, rt1, rt2, st2, c;
    uint64_t           t0, t1;
    unsigned long long i;

    if (!bytes) {
        bytes = ~(unsigned long long) 0;
    }
    /* H = 0 */
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128());

    /* clamp key */
    memcpy(&t0, key, 8);
    memcpy(&t1, key + 8, 8);
    r0 = t0 & 0xffc0fffffff;
    t0 >>= 44;
    t0 |= t1 << 20;
    r1 = t0 & 0xfffffc0ffff;
    t1 >>= 24;
    r2 = t1 & 0x00ffffffc0f;

    /* r^1 */
    R    = st->R;
    R[0] = (uint32_t)(r0) & 0x3ffffff;
    R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
    R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
    R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
    R[4] = (uint32_t)((r2 >> 16));
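
    /*
     * The scalar setup above keeps r in three 44-bit limbs (r0, r1, r2);
     * the SIMD code operates on five 26-bit limbs, so the same 130 bits
     * are re-sliced here, e.g. limb 1 takes the top 18 bits of r0 plus
     * the low 8 bits of r1. The loop below precomputes r^2 and r^4 in the
     * same layout: each 64-byte iteration multiplies H by [r^4,r^4] and
     * the fresh block pair by [r^2,r^2], so r^2 is skipped for one-block
     * inputs and r^4 whenever fewer than 96 bytes will be processed.
     */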

    /* save pad */
    memcpy(&st->pad[0], key + 16, 8);
    memcpy(&st->pad[1], key + 24, 8);

    rt0 = r0;
    rt1 = r1;
    rt2 = r2;

    /* r^2, r^4 */
    for (i = 0; i < 2; i++) {
        if (i == 0) {
            R = st->R2;
            if (bytes <= 16) {
                break;
            }
        } else if (i == 1) {
            R = st->R4;
            if (bytes < 96) {
                break;
            }
        }
        /* 2^130 = 5 (mod p): limb products that overshoot 2^130 are folded
           back in with a factor of 5; st2 = 4 * 5 * rt2 */
        st2 = rt2 * (5 << 2);

        d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
        d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
        d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);

        rt0 = (uint64_t) d[0] & 0xfffffffffff;
        c   = (uint64_t)(d[0] >> 44);
        d[1] += c;

        rt1 = (uint64_t) d[1] & 0xfffffffffff;
        c   = (uint64_t)(d[1] >> 44);
        d[2] += c;

        rt2 = (uint64_t) d[2] & 0x3ffffffffff;
        c   = (uint64_t)(d[2] >> 42);
        rt0 += c * 5;
        c   = (rt0 >> 44);
        rt0 = rt0 & 0xfffffffffff;
        rt1 += c;
        c   = (rt1 >> 44);
        rt1 = rt1 & 0xfffffffffff;
        rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely,
                     and is safe to multiply with */

        R[0] = (uint32_t)(rt0) & 0x3ffffff;
        R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
        R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
        R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
        R[4] = (uint32_t)((rt2 >> 16));
    }
    st->flags    = 0;
    st->leftover = 0U;
}

static POLY1305_NOINLINE void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
                unsigned long long bytes)
{
    /* HIBIT = 2^24 in limb 4 of each lane, i.e. the 2^128 padding bit of a
       full 16-byte block; the shift8/shift16 flags below confine it to one
       lane or clear it for the final chunk */
    CRYPTO_ALIGN(64)
    xmmi HIBIT =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
    const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
                                         _MM_SHUFFLE(1, 0, 1, 0));
    const xmmi FIVE =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
    xmmi H0, H1, H2, H3, H4;
    xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
    xmmi M0, M1, M2, M3, M4;
    xmmi M5, M6, M7, M8;
    xmmi C1, C2;
    xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
    xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;

    if (st->flags & poly1305_final_shift8) {
        HIBIT = _mm_srli_si128(HIBIT, 8);
    }
    if (st->flags & poly1305_final_shift16) {
        HIBIT = _mm_setzero_si128();
    }
    if (!(st->flags & poly1305_started)) {
        /* H = [Mx,My] */
        T5 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
        T6 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
        H0 = _mm_and_si128(MMASK, T5);
        H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
        H2 = _mm_and_si128(MMASK, T5);
        H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        H4 = _mm_srli_epi64(T6, 40);
        H4 = _mm_or_si128(H4, HIBIT);
        m += 32;
        bytes -= 32;
        st->flags |= poly1305_started;
    } else {
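        /*
         * hh[] keeps the two accumulator lanes interleaved as (x, y) pairs
         * of 26-bit limbs; the shuffles below broadcast each 32-bit limb
         * across its 64-bit lane, which is harmless because
         * _mm_mul_epu32() only reads the low 32 bits of every lane.
         */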
        T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]);
        T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]);
        T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]);
        H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
        H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
        H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
        H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
        H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
    }
    if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
        if (st->flags & poly1305_final_r2_r) {
            /* use [r^2, r] */
            T2  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T3  = _mm_cvtsi32_si128(st->R[4]);
            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
            T1  = _mm_cvtsi32_si128(st->R2[4]);
            T4  = _mm_unpacklo_epi32(T0, T2);
            T5  = _mm_unpackhi_epi32(T0, T2);
            R24 = _mm_unpacklo_epi64(T1, T3);
        } else {
            /* use [r^1, 1] */
            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T1  = _mm_cvtsi32_si128(st->R[4]);
            T2  = _mm_cvtsi32_si128(1);
            T4  = _mm_unpacklo_epi32(T0, T2);
            T5  = _mm_unpackhi_epi32(T0, T2);
            R24 = T1;
        }
        R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
        R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
        R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
        R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
    } else {
        /* use [r^2, r^2] */
        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
        T1  = _mm_cvtsi32_si128(st->R2[4]);
        R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
    }
    S21 = _mm_mul_epu32(R21, FIVE);
    S22 = _mm_mul_epu32(R22, FIVE);
    S23 = _mm_mul_epu32(R23, FIVE);
    S24 = _mm_mul_epu32(R24, FIVE);

    if (bytes >= 64) {
        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
        T1  = _mm_cvtsi32_si128(st->R4[4]);
        R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
        S41 = _mm_mul_epu32(R41, FIVE);
        S42 = _mm_mul_epu32(R42, FIVE);
        S43 = _mm_mul_epu32(R43, FIVE);
        S44 = _mm_mul_epu32(R44, FIVE);

        while (bytes >= 64) {
            xmmi v00, v01, v02, v03, v04;
            xmmi v10, v11, v12, v13, v14;
            xmmi v20, v21, v22, v23, v24;
            xmmi v30, v31, v32, v33, v34;
            xmmi v40, v41, v42, v43, v44;
            xmmi T14, T15;

            /* H *= [r^4,r^4], preload [Mx,My] */
            T15 = S42;
            T0  = H4;
            T0  = _mm_mul_epu32(T0, S41);
            v01 = H3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S43;
            T1  = H4;
            T1  = _mm_mul_epu32(T1, T15);
            v11 = H3;
            v11 = _mm_mul_epu32(v11, T14);
            T2  = H4;
            T2  = _mm_mul_epu32(T2, T14);
            T0  = _mm_add_epi64(T0, v01);
            T15 = S44;
            v02 = H2;
            v02 = _mm_mul_epu32(v02, T14);
            T3  = H4;
            T3  = _mm_mul_epu32(T3, T15);
            T1  = _mm_add_epi64(T1, v11);
            v03 = H1;
            v03 = _mm_mul_epu32(v03, T15);
            v12 = H2;
            v12 = _mm_mul_epu32(v12, T15);
            T0  = _mm_add_epi64(T0, v02);
            T14 = R40;
            v21 = H3;
            v21 = _mm_mul_epu32(v21, T15);
            v31 = H3;
            v31 = _mm_mul_epu32(v31, T14);
            T0  = _mm_add_epi64(T0, v03);
            T4  = H4;
            T4  = _mm_mul_epu32(T4, T14);
            T1  = _mm_add_epi64(T1, v12);
            v04 = H0;
            v04 = _mm_mul_epu32(v04, T14);
            T2  = _mm_add_epi64(T2, v21);
            v13 = H1;
            v13 = _mm_mul_epu32(v13, T14);
            T3  = _mm_add_epi64(T3, v31);
            T15 = R41;
            v22 = H2;
            v22 = _mm_mul_epu32(v22, T14);
            v32 = H2;
            v32 = _mm_mul_epu32(v32, T15);
            T0  = _mm_add_epi64(T0, v04);
            v41 = H3;
            v41 = _mm_mul_epu32(v41, T15);
            T1  = _mm_add_epi64(T1, v13);
            v14 = H0;
            v14 = _mm_mul_epu32(v14, T15);
            T2  = _mm_add_epi64(T2, v22);
            T14 = R42;
            T5  = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
            v23 = H1;
            v23 = _mm_mul_epu32(v23, T15);
            T3  = _mm_add_epi64(T3, v32);
            v33 = H1;
            v33 = _mm_mul_epu32(v33, T14);
            T4  = _mm_add_epi64(T4, v41);
            v42 = H2;
            v42 = _mm_mul_epu32(v42, T14);
            T1  = _mm_add_epi64(T1, v14);
            T15 = R43;
            T6  = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
            v24 = H0;
            v24 = _mm_mul_epu32(v24, T14);
            T2  = _mm_add_epi64(T2, v23);
            v34 = H0;
            v34 = _mm_mul_epu32(v34, T15);
            T3  = _mm_add_epi64(T3, v33);
            M0  = _mm_and_si128(MMASK, T5);
            v43 = H1;
            v43 = _mm_mul_epu32(v43, T15);
            T4  = _mm_add_epi64(T4, v42);
            M1  = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
            v44 = H0;
            v44 = _mm_mul_epu32(v44, R44);
            T2  = _mm_add_epi64(T2, v24);
            T5  = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
            T3  = _mm_add_epi64(T3, v34);
            M3  = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
            T4  = _mm_add_epi64(T4, v43);
            M2  = _mm_and_si128(MMASK, T5);
            T4  = _mm_add_epi64(T4, v44);
            M4  = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
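
            /*
             * The second block pair is added in radix 2^32: message word i
             * lands in 26-bit limb i, pre-shifted left by 32*i - 26*i =
             * 6*i bits, and the 2^128 padding bit arrives as HIBIT (2^24)
             * in limb 4. The temporarily oversized limbs are brought back
             * under 2^26 by the carry pass further down.
             */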
            /* H += [Mx',My'] */
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M6 = _mm_slli_epi64(M6, 6);
            M7 = _mm_slli_epi64(M7, 12);
            M8 = _mm_slli_epi64(M8, 18);
            T0 = _mm_add_epi64(T0, M5);
            T1 = _mm_add_epi64(T1, M6);
            T2 = _mm_add_epi64(T2, M7);
            T3 = _mm_add_epi64(T3, M8);
            T4 = _mm_add_epi64(T4, HIBIT);

            /* H += [Mx,My]*[r^2,r^2] */
            T15 = S22;
            v00 = M4;
            v00 = _mm_mul_epu32(v00, S21);
            v01 = M3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S23;
            v10 = M4;
            v10 = _mm_mul_epu32(v10, T15);
            v11 = M3;
            v11 = _mm_mul_epu32(v11, T14);
            T0  = _mm_add_epi64(T0, v00);
            v20 = M4;
            v20 = _mm_mul_epu32(v20, T14);
            T0  = _mm_add_epi64(T0, v01);
            T15 = S24;
            v02 = M2;
            v02 = _mm_mul_epu32(v02, T14);
            T1  = _mm_add_epi64(T1, v10);
            v30 = M4;
            v30 = _mm_mul_epu32(v30, T15);
            T1  = _mm_add_epi64(T1, v11);
            v03 = M1;
            v03 = _mm_mul_epu32(v03, T15);
            T2  = _mm_add_epi64(T2, v20);
            v12 = M2;
            v12 = _mm_mul_epu32(v12, T15);
            T0  = _mm_add_epi64(T0, v02);
            T14 = R20;
            v21 = M3;
            v21 = _mm_mul_epu32(v21, T15);
            T3  = _mm_add_epi64(T3, v30);
            v31 = M3;
            v31 = _mm_mul_epu32(v31, T14);
            T0  = _mm_add_epi64(T0, v03);
            v40 = M4;
            v40 = _mm_mul_epu32(v40, T14);
            T1  = _mm_add_epi64(T1, v12);
            v04 = M0;
            v04 = _mm_mul_epu32(v04, T14);
            T2  = _mm_add_epi64(T2, v21);
            v13 = M1;
            v13 = _mm_mul_epu32(v13, T14);
            T3  = _mm_add_epi64(T3, v31);
            T15 = R21;
            v22 = M2;
            v22 = _mm_mul_epu32(v22, T14);
            T4  = _mm_add_epi64(T4, v40);
            v32 = M2;
            v32 = _mm_mul_epu32(v32, T15);
            T0  = _mm_add_epi64(T0, v04);
            v41 = M3;
            v41 = _mm_mul_epu32(v41, T15);
            T1  = _mm_add_epi64(T1, v13);
            v14 = M0;
            v14 = _mm_mul_epu32(v14, T15);
            T2  = _mm_add_epi64(T2, v22);
            T14 = R22;
            v23 = M1;
            v23 = _mm_mul_epu32(v23, T15);
            T3  = _mm_add_epi64(T3, v32);
            v33 = M1;
            v33 = _mm_mul_epu32(v33, T14);
            T4  = _mm_add_epi64(T4, v41);
            v42 = M2;
            v42 = _mm_mul_epu32(v42, T14);
            T1  = _mm_add_epi64(T1, v14);
            T15 = R23;
            v24 = M0;
            v24 = _mm_mul_epu32(v24, T14);
            T2  = _mm_add_epi64(T2, v23);
            v34 = M0;
            v34 = _mm_mul_epu32(v34, T15);
            T3  = _mm_add_epi64(T3, v33);
            v43 = M1;
            v43 = _mm_mul_epu32(v43, T15);
            T4  = _mm_add_epi64(T4, v42);
            v44 = M0;
            v44 = _mm_mul_epu32(v44, R24);
            T2  = _mm_add_epi64(T2, v24);
            T3  = _mm_add_epi64(T3, v34);
            T4  = _mm_add_epi64(T4, v43);
            T4  = _mm_add_epi64(T4, v44);
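
            /*
             * Each T is a sum of 26x26-bit lane products plus the radix
             * 2^32 message words, which stays well below 2^64, so one
             * carry pass suffices: the carry out of limb 4 is folded back
             * into limb 0 multiplied by 5, since 2^130 = 5 (mod p).
             */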
            /* reduce */
            C1 = _mm_srli_epi64(T0, 26);
            C2 = _mm_srli_epi64(T3, 26);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_and_si128(T3, MMASK);
            T1 = _mm_add_epi64(T1, C1);
            T4 = _mm_add_epi64(T4, C2);
            C1 = _mm_srli_epi64(T1, 26);
            C2 = _mm_srli_epi64(T4, 26);
            T1 = _mm_and_si128(T1, MMASK);
            T4 = _mm_and_si128(T4, MMASK);
            T2 = _mm_add_epi64(T2, C1);
            T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
            C1 = _mm_srli_epi64(T2, 26);
            C2 = _mm_srli_epi64(T0, 26);
            T2 = _mm_and_si128(T2, MMASK);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_add_epi64(T3, C1);
            T1 = _mm_add_epi64(T1, C2);
            C1 = _mm_srli_epi64(T3, 26);
            T3 = _mm_and_si128(T3, MMASK);
            T4 = _mm_add_epi64(T4, C1);

            /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
            H0 = T0;
            H1 = T1;
            H2 = T2;
            H3 = T3;
            H4 = T4;

            m += 64;
            bytes -= 64;
        }
    }
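
    /*
     * At most one 32-byte block pair remains (or the padded final chunk,
     * or the m == NULL finalization pass): same multiply schedule as the
     * main loop, but H is multiplied by the two-block powers selected
     * above, and the message add is skipped when m is NULL.
     */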
    if (bytes >= 32) {
        xmmi v01, v02, v03, v04;
        xmmi v11, v12, v13, v14;
        xmmi v21, v22, v23, v24;
        xmmi v31, v32, v33, v34;
        xmmi v41, v42, v43, v44;
        xmmi T14, T15;

        /* H *= [r^2,r^2] */
        T15 = S22;
        T0  = H4;
        T0  = _mm_mul_epu32(T0, S21);
        v01 = H3;
        v01 = _mm_mul_epu32(v01, T15);
        T14 = S23;
        T1  = H4;
        T1  = _mm_mul_epu32(T1, T15);
        v11 = H3;
        v11 = _mm_mul_epu32(v11, T14);
        T2  = H4;
        T2  = _mm_mul_epu32(T2, T14);
        T0  = _mm_add_epi64(T0, v01);
        T15 = S24;
        v02 = H2;
        v02 = _mm_mul_epu32(v02, T14);
        T3  = H4;
        T3  = _mm_mul_epu32(T3, T15);
        T1  = _mm_add_epi64(T1, v11);
        v03 = H1;
        v03 = _mm_mul_epu32(v03, T15);
        v12 = H2;
        v12 = _mm_mul_epu32(v12, T15);
        T0  = _mm_add_epi64(T0, v02);
        T14 = R20;
        v21 = H3;
        v21 = _mm_mul_epu32(v21, T15);
        v31 = H3;
        v31 = _mm_mul_epu32(v31, T14);
        T0  = _mm_add_epi64(T0, v03);
        T4  = H4;
        T4  = _mm_mul_epu32(T4, T14);
        T1  = _mm_add_epi64(T1, v12);
        v04 = H0;
        v04 = _mm_mul_epu32(v04, T14);
        T2  = _mm_add_epi64(T2, v21);
        v13 = H1;
        v13 = _mm_mul_epu32(v13, T14);
        T3  = _mm_add_epi64(T3, v31);
        T15 = R21;
        v22 = H2;
        v22 = _mm_mul_epu32(v22, T14);
        v32 = H2;
        v32 = _mm_mul_epu32(v32, T15);
        T0  = _mm_add_epi64(T0, v04);
        v41 = H3;
        v41 = _mm_mul_epu32(v41, T15);
        T1  = _mm_add_epi64(T1, v13);
        v14 = H0;
        v14 = _mm_mul_epu32(v14, T15);
        T2  = _mm_add_epi64(T2, v22);
        T14 = R22;
        v23 = H1;
        v23 = _mm_mul_epu32(v23, T15);
        T3  = _mm_add_epi64(T3, v32);
        v33 = H1;
        v33 = _mm_mul_epu32(v33, T14);
        T4  = _mm_add_epi64(T4, v41);
        v42 = H2;
        v42 = _mm_mul_epu32(v42, T14);
        T1  = _mm_add_epi64(T1, v14);
        T15 = R23;
        v24 = H0;
        v24 = _mm_mul_epu32(v24, T14);
        T2  = _mm_add_epi64(T2, v23);
        v34 = H0;
        v34 = _mm_mul_epu32(v34, T15);
        T3  = _mm_add_epi64(T3, v33);
        v43 = H1;
        v43 = _mm_mul_epu32(v43, T15);
        T4  = _mm_add_epi64(T4, v42);
        v44 = H0;
        v44 = _mm_mul_epu32(v44, R24);
        T2  = _mm_add_epi64(T2, v24);
        T3  = _mm_add_epi64(T3, v34);
        T4  = _mm_add_epi64(T4, v43);
        T4  = _mm_add_epi64(T4, v44);

        /* H += [Mx,My] */
        if (m) {
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M1 = _mm_slli_epi64(M1, 6);
            M2 = _mm_slli_epi64(M2, 12);
            M3 = _mm_slli_epi64(M3, 18);
            T0 = _mm_add_epi64(T0, M0);
            T1 = _mm_add_epi64(T1, M1);
            T2 = _mm_add_epi64(T2, M2);
            T3 = _mm_add_epi64(T3, M3);
            T4 = _mm_add_epi64(T4, HIBIT);
        }

        /* reduce */
        C1 = _mm_srli_epi64(T0, 26);
        C2 = _mm_srli_epi64(T3, 26);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_and_si128(T3, MMASK);
        T1 = _mm_add_epi64(T1, C1);
        T4 = _mm_add_epi64(T4, C2);
        C1 = _mm_srli_epi64(T1, 26);
        C2 = _mm_srli_epi64(T4, 26);
        T1 = _mm_and_si128(T1, MMASK);
        T4 = _mm_and_si128(T4, MMASK);
        T2 = _mm_add_epi64(T2, C1);
        T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
        C1 = _mm_srli_epi64(T2, 26);
        C2 = _mm_srli_epi64(T0, 26);
        T2 = _mm_and_si128(T2, MMASK);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_add_epi64(T3, C1);
        T1 = _mm_add_epi64(T1, C2);
        C1 = _mm_srli_epi64(T3, 26);
        T3 = _mm_and_si128(T3, MMASK);
        T4 = _mm_add_epi64(T4, C1);

        /* H = (H*[r^2,r^2] + [Mx,My]) */
        H0 = T0;
        H1 = T1;
        H2 = T2;
        H3 = T3;
        H4 = T4;
    }

    if (m) {
        /* more input to come: store H back to hh[] as interleaved (x, y)
           pairs of 26-bit limbs */
        T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
        T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
        T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
        T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
        T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
        T0 = _mm_unpacklo_epi64(T0, T1);
        T1 = _mm_unpacklo_epi64(T2, T3);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1);
        _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4);
    } else {
        uint32_t t0, t1, t2, t3, t4, b;
        uint64_t h0, h1, h2, g0, g1, g2, c, nc;

        /* H = H[0]+H[1] */
        T0 = H0;
        T1 = H1;
        T2 = H2;
        T3 = H3;
        T4 = H4;

        T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
        T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
        T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
        T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
        T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

        t0 = _mm_cvtsi128_si32(T0);
        b  = (t0 >> 26);
        t0 &= 0x3ffffff;
        t1 = _mm_cvtsi128_si32(T1) + b;
        b  = (t1 >> 26);
        t1 &= 0x3ffffff;
        t2 = _mm_cvtsi128_si32(T2) + b;
        b  = (t2 >> 26);
        t2 &= 0x3ffffff;
        t3 = _mm_cvtsi128_si32(T3) + b;
        b  = (t3 >> 26);
        t3 &= 0x3ffffff;
        t4 = _mm_cvtsi128_si32(T4) + b;
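
        /*
         * Repack the five 26-bit limbs into three 44-bit limbs
         * (44 + 44 + 42 = 130 bits) for the scalar reduction below.
         */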
        /* everything except t4 is in range, so this is all safe */
        h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
        h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
              ((uint64_t) t3 << 34)) &
             0xfffffffffffull;
        h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));

        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;
        c = (h1 >> 44);
        h1 &= 0xfffffffffff;
        h2 += c;
        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;

        /* g = h + 5 - 2^130 = h - p; if there was no borrow, h >= p and g
           is the canonical value, so select g or h in constant time */
        g0 = h0 + 5;
        c  = (g0 >> 44);
        g0 &= 0xfffffffffff;
        g1 = h1 + c;
        c  = (g1 >> 44);
        g1 &= 0xfffffffffff;
        g2 = h2 + c - ((uint64_t) 1 << 42);

        c  = (g2 >> 63) - 1;
        nc = ~c;
        h0 = (h0 & nc) | (g0 & c);
        h1 = (h1 & nc) | (g1 & c);
        h2 = (h2 & nc) | (g2 & c);

        st->H.h[0] = h0;
        st->H.h[1] = h1;
        st->H.h[2] = h2;
    }
}

static void
poly1305_update(poly1305_state_internal_t *st, const unsigned char *m,
                unsigned long long bytes)
{
    unsigned long long i;

    /* handle leftover */
    if (st->leftover) {
        unsigned long long want = (poly1305_block_size - st->leftover);

        if (want > bytes) {
            want = bytes;
        }
        for (i = 0; i < want; i++) {
            st->buffer[st->leftover + i] = m[i];
        }
        bytes -= want;
        m += want;
        st->leftover += want;
        if (st->leftover < poly1305_block_size) {
            return;
        }
        poly1305_blocks(st, st->buffer, poly1305_block_size);
        st->leftover = 0;
    }

    /* process full blocks */
    if (bytes >= poly1305_block_size) {
        unsigned long long want = (bytes & ~(poly1305_block_size - 1));

        poly1305_blocks(st, m, want);
        m += want;
        bytes -= want;
    }

    /* store leftover */
    if (bytes) {
        for (i = 0; i < bytes; i++) {
            st->buffer[st->leftover + i] = m[i];
        }
        st->leftover += bytes;
    }
}
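
/*
 * Finalization: 1 to 31 trailing bytes are zero-padded into a 32-byte chunk
 * and fed through poly1305_blocks(), with the shift8/shift16 flags steering
 * where the 2^128 padding bit is applied; one more call with m == NULL then
 * multiplies the two lanes by [r^2,r] or [r,1] as appropriate and folds
 * them into a single 130-bit accumulator.
 */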
static POLY1305_NOINLINE void
poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
                    unsigned long long leftover, unsigned char mac[16])
{
    uint64_t h0, h1, h2;

    if (leftover) {
        CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };

        poly1305_block_copy31(final, m, leftover);
        if (leftover != 16) {
            final[leftover] = 1;
        }
        st->flags |=
            (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
        poly1305_blocks(st, final, 32);
    }

    if (st->flags & poly1305_started) {
        /* finalize, H *= [r^2,r], or H *= [r,1] */
        if (!leftover || (leftover > 16)) {
            st->flags |= poly1305_final_r2_r;
        } else {
            st->flags |= poly1305_final_r_1;
        }
        poly1305_blocks(st, NULL, 32);
    }

    h0 = st->H.h[0];
    h1 = st->H.h[1];
    h2 = st->H.h[2];

    /* pad */
    h0 = ((h0) | (h1 << 44));
    h1 = ((h1 >> 20) | (h2 << 24));
# ifdef HAVE_AMD64_ASM
    __asm__ __volatile__(
        "addq %2, %0 ;\n"
        "adcq %3, %1 ;\n"
        : "+r"(h0), "+r"(h1)
        : "r"(st->pad[0]), "r"(st->pad[1])
        : "flags", "cc");
# else
    {
        uint128_t h;

        memcpy(&h, &st->pad[0], 16);
        h += ((uint128_t) h1 << 64) | h0;
        h0 = (uint64_t) h;
        h1 = (uint64_t)(h >> 64);
    }
# endif
    /* burn the first 128 bytes of the state with wide stores;
       sodium_memzero() below scrubs the rest */
    _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());

    memcpy(&mac[0], &h0, 8);
    memcpy(&mac[8], &h1, 8);

    sodium_memzero((void *) st, sizeof *st);
}

static void
poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
{
    poly1305_finish_ext(st, st->buffer, st->leftover, mac);
}

static int
crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
                                      const unsigned char *key)
{
    COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
                    sizeof(poly1305_state_internal_t));
    poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_update(
    crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
    unsigned long long inlen)
{
    poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
                                       unsigned char *out)
{
    poly1305_finish((poly1305_state_internal_t *) (void *) state, out);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
                                 unsigned long long inlen,
                                 const unsigned char *key)
{
    CRYPTO_ALIGN(64) poly1305_state_internal_t st;
    unsigned long long blocks;

    poly1305_init_ext(&st, key, inlen);
    blocks = inlen & ~31;
    if (blocks > 0) {
        poly1305_blocks(&st, m, blocks);
        m += blocks;
        inlen -= blocks;
    }
    poly1305_finish_ext(&st, m, inlen, out);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
                                        const unsigned char *in,
                                        unsigned long long inlen,
                                        const unsigned char *k)
{
    unsigned char correct[16];

    crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);

    return crypto_verify_16(h, correct);
}
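
/*
 * Dispatch table for this backend; the generic Poly1305 front end selects
 * it at runtime when the CPU reports SSE2 support. Callers normally go
 * through the public API; a minimal usage sketch (assuming sodium_init()
 * succeeded):
 *
 *     unsigned char key[crypto_onetimeauth_KEYBYTES]; // 32 bytes, use once
 *     unsigned char tag[crypto_onetimeauth_BYTES];    // 16-byte tag
 *
 *     crypto_onetimeauth(tag, msg, msglen, key);
 *     if (crypto_onetimeauth_verify(tag, msg, msglen, key) != 0) {
 *         // forged or corrupted
 *     }
 */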
struct crypto_onetimeauth_poly1305_implementation
    crypto_onetimeauth_poly1305_sse2_implementation = {
        SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
        SODIUM_C99(.onetimeauth_verify =)
            crypto_onetimeauth_poly1305_sse2_verify,
        SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
        SODIUM_C99(.onetimeauth_update =)
            crypto_onetimeauth_poly1305_sse2_update,
        SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
    };

#endif