10ac341f1SConrad Meyer 
20ac341f1SConrad Meyer #include <stdint.h>
30ac341f1SConrad Meyer #include <string.h>
40ac341f1SConrad Meyer 
50ac341f1SConrad Meyer #include "../onetimeauth_poly1305.h"
60ac341f1SConrad Meyer #include "crypto_verify_16.h"
70ac341f1SConrad Meyer #include "poly1305_sse2.h"
80ac341f1SConrad Meyer #include "private/common.h"
90ac341f1SConrad Meyer #include "private/sse2_64_32.h"
100ac341f1SConrad Meyer #include "utils.h"
110ac341f1SConrad Meyer 
120ac341f1SConrad Meyer #if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)
130ac341f1SConrad Meyer 
140ac341f1SConrad Meyer # ifdef __GNUC__
150ac341f1SConrad Meyer #  pragma GCC target("sse2")
160ac341f1SConrad Meyer # endif
170ac341f1SConrad Meyer 
180ac341f1SConrad Meyer # include <emmintrin.h>
190ac341f1SConrad Meyer 
/* Shorthand for the 128-bit SSE2 integer vector type. */
typedef __m128i xmmi;

/*
 * Marker that asks the compiler not to inline a function.
 * Expands to nothing on compilers that are neither MSVC, clang nor GCC.
 */
# if defined(_MSC_VER)
#  define POLY1305_NOINLINE __declspec(noinline)
# elif defined(__clang__) || defined(__GNUC__)
#  define POLY1305_NOINLINE __attribute__((noinline))
# else
#  define POLY1305_NOINLINE
# endif

/* Number of message bytes held by the state's staging buffer. */
# define poly1305_block_size 32
310ac341f1SConrad Meyer 
/*
 * Bit flags stored in poly1305_state_internal_t.flags and consulted by
 * poly1305_blocks(). Each flag is a distinct bit so they can be OR-ed.
 */
enum poly1305_state_flags_t {
    poly1305_started       = 1,  /* H already holds data; set once the first
                                    32 bytes have been loaded into H */
    poly1305_final_shift8  = 4,  /* HIBIT is shifted right by 8 bytes */
    poly1305_final_shift16 = 8,  /* HIBIT is cleared entirely */
    poly1305_final_r2_r    = 16, /* use [r^2,r] for the final block */
    poly1305_final_r_1     = 32  /* use [r,1] for the final block */
};
390ac341f1SConrad Meyer 
400ac341f1SConrad Meyer typedef struct poly1305_state_internal_t {
410ac341f1SConrad Meyer     union {
420ac341f1SConrad Meyer         uint64_t h[3];
430ac341f1SConrad Meyer         uint32_t hh[10];
440ac341f1SConrad Meyer     } H;                                            /*  40 bytes  */
450ac341f1SConrad Meyer     uint32_t           R[5];                        /*  20 bytes  */
460ac341f1SConrad Meyer     uint32_t           R2[5];                       /*  20 bytes  */
470ac341f1SConrad Meyer     uint32_t           R4[5];                       /*  20 bytes  */
480ac341f1SConrad Meyer     uint64_t           pad[2];                      /*  16 bytes  */
490ac341f1SConrad Meyer     uint64_t           flags;                       /*   8 bytes  */
500ac341f1SConrad Meyer     unsigned long long leftover;                    /* 8 bytes */
510ac341f1SConrad Meyer     unsigned char      buffer[poly1305_block_size]; /* 32 bytes */
520ac341f1SConrad Meyer } poly1305_state_internal_t;                        /* 164 bytes total */
530ac341f1SConrad Meyer 
/*
 * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are
 * totally fine, even though this intrinsic requires a __m128i* input.
 * This confuses dynamic analysis, so force alignment, only in debug mode.
 */
# ifdef DEBUG
/*
 * Debug-only replacement for _mm_loadl_epi64(): copies the 8 bytes at `m`
 * into a local xmmi and performs the real load from that temporary.
 * The call below still refers to the genuine intrinsic because the
 * overriding macro is only defined after this function.
 */
static xmmi
_fakealign_mm_loadl_epi64(const void *m)
{
    xmmi tmp;
    memcpy(&tmp, m, 8);

    return _mm_loadl_epi64(&tmp);
}
# define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
# endif
700ac341f1SConrad Meyer 
710ac341f1SConrad Meyer /* copy 0-31 bytes */
720ac341f1SConrad Meyer static inline void
poly1305_block_copy31(unsigned char * dst,const unsigned char * src,unsigned long long bytes)730ac341f1SConrad Meyer poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
740ac341f1SConrad Meyer                       unsigned long long bytes)
750ac341f1SConrad Meyer {
760ac341f1SConrad Meyer     if (bytes & 16) {
770ac341f1SConrad Meyer         _mm_store_si128((xmmi *) (void *) dst,
780ac341f1SConrad Meyer                         _mm_loadu_si128((const xmmi *) (const void *) src));
790ac341f1SConrad Meyer         src += 16;
800ac341f1SConrad Meyer         dst += 16;
810ac341f1SConrad Meyer     }
820ac341f1SConrad Meyer     if (bytes & 8) {
830ac341f1SConrad Meyer         memcpy(dst, src, 8);
840ac341f1SConrad Meyer         src += 8;
850ac341f1SConrad Meyer         dst += 8;
860ac341f1SConrad Meyer     }
870ac341f1SConrad Meyer     if (bytes & 4) {
880ac341f1SConrad Meyer         memcpy(dst, src, 4);
890ac341f1SConrad Meyer         src += 4;
900ac341f1SConrad Meyer         dst += 4;
910ac341f1SConrad Meyer     }
920ac341f1SConrad Meyer     if (bytes & 2) {
930ac341f1SConrad Meyer         memcpy(dst, src, 2);
940ac341f1SConrad Meyer         src += 2;
950ac341f1SConrad Meyer         dst += 2;
960ac341f1SConrad Meyer     }
970ac341f1SConrad Meyer     if (bytes & 1) {
980ac341f1SConrad Meyer         *dst = *src;
990ac341f1SConrad Meyer     }
1000ac341f1SConrad Meyer }
1010ac341f1SConrad Meyer 
1020ac341f1SConrad Meyer static POLY1305_NOINLINE void
poly1305_init_ext(poly1305_state_internal_t * st,const unsigned char key[32],unsigned long long bytes)1030ac341f1SConrad Meyer poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
1040ac341f1SConrad Meyer                   unsigned long long bytes)
1050ac341f1SConrad Meyer {
1060ac341f1SConrad Meyer     uint32_t          *R;
1070ac341f1SConrad Meyer     uint128_t          d[3];
1080ac341f1SConrad Meyer     uint64_t           r0, r1, r2;
1090ac341f1SConrad Meyer     uint64_t           rt0, rt1, rt2, st2, c;
1100ac341f1SConrad Meyer     uint64_t           t0, t1;
1110ac341f1SConrad Meyer     unsigned long long i;
1120ac341f1SConrad Meyer 
1130ac341f1SConrad Meyer     if (!bytes) {
1140ac341f1SConrad Meyer         bytes = ~(unsigned long long) 0;
1150ac341f1SConrad Meyer     }
1160ac341f1SConrad Meyer     /* H = 0 */
1170ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128());
1180ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128());
1190ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128());
1200ac341f1SConrad Meyer 
1210ac341f1SConrad Meyer     /* clamp key */
1220ac341f1SConrad Meyer     memcpy(&t0, key, 8);
1230ac341f1SConrad Meyer     memcpy(&t1, key + 8, 8);
1240ac341f1SConrad Meyer     r0 = t0 & 0xffc0fffffff;
1250ac341f1SConrad Meyer     t0 >>= 44;
1260ac341f1SConrad Meyer     t0 |= t1 << 20;
1270ac341f1SConrad Meyer     r1 = t0 & 0xfffffc0ffff;
1280ac341f1SConrad Meyer     t1 >>= 24;
1290ac341f1SConrad Meyer     r2 = t1 & 0x00ffffffc0f;
1300ac341f1SConrad Meyer 
1310ac341f1SConrad Meyer     /* r^1 */
1320ac341f1SConrad Meyer     R    = st->R;
1330ac341f1SConrad Meyer     R[0] = (uint32_t)(r0) &0x3ffffff;
1340ac341f1SConrad Meyer     R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
1350ac341f1SConrad Meyer     R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
1360ac341f1SConrad Meyer     R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
1370ac341f1SConrad Meyer     R[4] = (uint32_t)((r2 >> 16));
1380ac341f1SConrad Meyer 
1390ac341f1SConrad Meyer     /* save pad */
1400ac341f1SConrad Meyer     memcpy(&st->pad[0], key + 16, 8);
1410ac341f1SConrad Meyer     memcpy(&st->pad[1], key + 24, 8);
1420ac341f1SConrad Meyer 
1430ac341f1SConrad Meyer     rt0 = r0;
1440ac341f1SConrad Meyer     rt1 = r1;
1450ac341f1SConrad Meyer     rt2 = r2;
1460ac341f1SConrad Meyer 
1470ac341f1SConrad Meyer     /* r^2, r^4 */
1480ac341f1SConrad Meyer     for (i = 0; i < 2; i++) {
1490ac341f1SConrad Meyer         if (i == 0) {
1500ac341f1SConrad Meyer             R = st->R2;
1510ac341f1SConrad Meyer             if (bytes <= 16) {
1520ac341f1SConrad Meyer                 break;
1530ac341f1SConrad Meyer             }
1540ac341f1SConrad Meyer         } else if (i == 1) {
1550ac341f1SConrad Meyer             R = st->R4;
1560ac341f1SConrad Meyer             if (bytes < 96) {
1570ac341f1SConrad Meyer                 break;
1580ac341f1SConrad Meyer             }
1590ac341f1SConrad Meyer         }
1600ac341f1SConrad Meyer         st2 = rt2 * (5 << 2);
1610ac341f1SConrad Meyer 
1620ac341f1SConrad Meyer         d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
1630ac341f1SConrad Meyer         d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
1640ac341f1SConrad Meyer         d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
1650ac341f1SConrad Meyer 
1660ac341f1SConrad Meyer         rt0 = (uint64_t) d[0] & 0xfffffffffff;
1670ac341f1SConrad Meyer         c   = (uint64_t)(d[0] >> 44);
1680ac341f1SConrad Meyer         d[1] += c;
1690ac341f1SConrad Meyer 
1700ac341f1SConrad Meyer         rt1 = (uint64_t) d[1] & 0xfffffffffff;
1710ac341f1SConrad Meyer         c   = (uint64_t)(d[1] >> 44);
1720ac341f1SConrad Meyer         d[2] += c;
1730ac341f1SConrad Meyer 
1740ac341f1SConrad Meyer         rt2 = (uint64_t) d[2] & 0x3ffffffffff;
1750ac341f1SConrad Meyer         c   = (uint64_t)(d[2] >> 42);
1760ac341f1SConrad Meyer         rt0 += c * 5;
1770ac341f1SConrad Meyer         c   = (rt0 >> 44);
1780ac341f1SConrad Meyer         rt0 = rt0 & 0xfffffffffff;
1790ac341f1SConrad Meyer         rt1 += c;
1800ac341f1SConrad Meyer         c   = (rt1 >> 44);
1810ac341f1SConrad Meyer         rt1 = rt1 & 0xfffffffffff;
1820ac341f1SConrad Meyer         rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely, and
1830ac341f1SConrad Meyer                      is safe to multiply with */
1840ac341f1SConrad Meyer 
1850ac341f1SConrad Meyer         R[0] = (uint32_t)(rt0) &0x3ffffff;
1860ac341f1SConrad Meyer         R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
1870ac341f1SConrad Meyer         R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
1880ac341f1SConrad Meyer         R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
1890ac341f1SConrad Meyer         R[4] = (uint32_t)((rt2 >> 16));
1900ac341f1SConrad Meyer     }
1910ac341f1SConrad Meyer     st->flags    = 0;
1920ac341f1SConrad Meyer     st->leftover = 0U;
1930ac341f1SConrad Meyer }
1940ac341f1SConrad Meyer 
1950ac341f1SConrad Meyer static POLY1305_NOINLINE void
poly1305_blocks(poly1305_state_internal_t * st,const unsigned char * m,unsigned long long bytes)1960ac341f1SConrad Meyer poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
1970ac341f1SConrad Meyer                 unsigned long long bytes)
1980ac341f1SConrad Meyer {
1990ac341f1SConrad Meyer     CRYPTO_ALIGN(64)
2000ac341f1SConrad Meyer     xmmi HIBIT =
2010ac341f1SConrad Meyer         _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
2020ac341f1SConrad Meyer     const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
2030ac341f1SConrad Meyer                                          _MM_SHUFFLE(1, 0, 1, 0));
2040ac341f1SConrad Meyer     const xmmi FIVE =
2050ac341f1SConrad Meyer         _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
2060ac341f1SConrad Meyer     xmmi H0, H1, H2, H3, H4;
2070ac341f1SConrad Meyer     xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
2080ac341f1SConrad Meyer     xmmi M0, M1, M2, M3, M4;
2090ac341f1SConrad Meyer     xmmi M5, M6, M7, M8;
2100ac341f1SConrad Meyer     xmmi C1, C2;
2110ac341f1SConrad Meyer     xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
2120ac341f1SConrad Meyer     xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;
2130ac341f1SConrad Meyer 
2140ac341f1SConrad Meyer     if (st->flags & poly1305_final_shift8) {
2150ac341f1SConrad Meyer         HIBIT = _mm_srli_si128(HIBIT, 8);
2160ac341f1SConrad Meyer     }
2170ac341f1SConrad Meyer     if (st->flags & poly1305_final_shift16) {
2180ac341f1SConrad Meyer         HIBIT = _mm_setzero_si128();
2190ac341f1SConrad Meyer     }
2200ac341f1SConrad Meyer     if (!(st->flags & poly1305_started)) {
2210ac341f1SConrad Meyer         /* H = [Mx,My] */
2220ac341f1SConrad Meyer         T5 = _mm_unpacklo_epi64(
2230ac341f1SConrad Meyer             _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
2240ac341f1SConrad Meyer             _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
2250ac341f1SConrad Meyer         T6 = _mm_unpacklo_epi64(
2260ac341f1SConrad Meyer             _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
2270ac341f1SConrad Meyer             _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
2280ac341f1SConrad Meyer         H0 = _mm_and_si128(MMASK, T5);
2290ac341f1SConrad Meyer         H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
2300ac341f1SConrad Meyer         T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
2310ac341f1SConrad Meyer         H2 = _mm_and_si128(MMASK, T5);
2320ac341f1SConrad Meyer         H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
2330ac341f1SConrad Meyer         H4 = _mm_srli_epi64(T6, 40);
2340ac341f1SConrad Meyer         H4 = _mm_or_si128(H4, HIBIT);
2350ac341f1SConrad Meyer         m += 32;
2360ac341f1SConrad Meyer         bytes -= 32;
2370ac341f1SConrad Meyer         st->flags |= poly1305_started;
2380ac341f1SConrad Meyer     } else {
2390ac341f1SConrad Meyer         T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]);
2400ac341f1SConrad Meyer         T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]);
2410ac341f1SConrad Meyer         T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]);
2420ac341f1SConrad Meyer         H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
2430ac341f1SConrad Meyer         H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
2440ac341f1SConrad Meyer         H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
2450ac341f1SConrad Meyer         H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
2460ac341f1SConrad Meyer         H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
2470ac341f1SConrad Meyer     }
2480ac341f1SConrad Meyer     if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
2490ac341f1SConrad Meyer         if (st->flags & poly1305_final_r2_r) {
2500ac341f1SConrad Meyer             /* use [r^2, r] */
2510ac341f1SConrad Meyer             T2  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
2520ac341f1SConrad Meyer             T3  = _mm_cvtsi32_si128(st->R[4]);
2530ac341f1SConrad Meyer             T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
2540ac341f1SConrad Meyer             T1  = _mm_cvtsi32_si128(st->R2[4]);
2550ac341f1SConrad Meyer             T4  = _mm_unpacklo_epi32(T0, T2);
2560ac341f1SConrad Meyer             T5  = _mm_unpackhi_epi32(T0, T2);
2570ac341f1SConrad Meyer             R24 = _mm_unpacklo_epi64(T1, T3);
2580ac341f1SConrad Meyer         } else {
2590ac341f1SConrad Meyer             /* use [r^1, 1] */
2600ac341f1SConrad Meyer             T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
2610ac341f1SConrad Meyer             T1  = _mm_cvtsi32_si128(st->R[4]);
2620ac341f1SConrad Meyer             T2  = _mm_cvtsi32_si128(1);
2630ac341f1SConrad Meyer             T4  = _mm_unpacklo_epi32(T0, T2);
2640ac341f1SConrad Meyer             T5  = _mm_unpackhi_epi32(T0, T2);
2650ac341f1SConrad Meyer             R24 = T1;
2660ac341f1SConrad Meyer         }
2670ac341f1SConrad Meyer         R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
2680ac341f1SConrad Meyer         R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
2690ac341f1SConrad Meyer         R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
2700ac341f1SConrad Meyer         R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
2710ac341f1SConrad Meyer     } else {
2720ac341f1SConrad Meyer         /* use [r^2, r^2] */
2730ac341f1SConrad Meyer         T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
2740ac341f1SConrad Meyer         T1  = _mm_cvtsi32_si128(st->R2[4]);
2750ac341f1SConrad Meyer         R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
2760ac341f1SConrad Meyer         R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
2770ac341f1SConrad Meyer         R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
2780ac341f1SConrad Meyer         R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
2790ac341f1SConrad Meyer         R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
2800ac341f1SConrad Meyer     }
2810ac341f1SConrad Meyer     S21 = _mm_mul_epu32(R21, FIVE);
2820ac341f1SConrad Meyer     S22 = _mm_mul_epu32(R22, FIVE);
2830ac341f1SConrad Meyer     S23 = _mm_mul_epu32(R23, FIVE);
2840ac341f1SConrad Meyer     S24 = _mm_mul_epu32(R24, FIVE);
2850ac341f1SConrad Meyer 
2860ac341f1SConrad Meyer     if (bytes >= 64) {
2870ac341f1SConrad Meyer         T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
2880ac341f1SConrad Meyer         T1  = _mm_cvtsi32_si128(st->R4[4]);
2890ac341f1SConrad Meyer         R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
2900ac341f1SConrad Meyer         R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
2910ac341f1SConrad Meyer         R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
2920ac341f1SConrad Meyer         R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
2930ac341f1SConrad Meyer         R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
2940ac341f1SConrad Meyer         S41 = _mm_mul_epu32(R41, FIVE);
2950ac341f1SConrad Meyer         S42 = _mm_mul_epu32(R42, FIVE);
2960ac341f1SConrad Meyer         S43 = _mm_mul_epu32(R43, FIVE);
2970ac341f1SConrad Meyer         S44 = _mm_mul_epu32(R44, FIVE);
2980ac341f1SConrad Meyer 
2990ac341f1SConrad Meyer         while (bytes >= 64) {
3000ac341f1SConrad Meyer             xmmi v00, v01, v02, v03, v04;
3010ac341f1SConrad Meyer             xmmi v10, v11, v12, v13, v14;
3020ac341f1SConrad Meyer             xmmi v20, v21, v22, v23, v24;
3030ac341f1SConrad Meyer             xmmi v30, v31, v32, v33, v34;
3040ac341f1SConrad Meyer             xmmi v40, v41, v42, v43, v44;
3050ac341f1SConrad Meyer             xmmi T14, T15;
3060ac341f1SConrad Meyer 
3070ac341f1SConrad Meyer             /* H *= [r^4,r^4], preload [Mx,My] */
3080ac341f1SConrad Meyer             T15 = S42;
3090ac341f1SConrad Meyer             T0  = H4;
3100ac341f1SConrad Meyer             T0  = _mm_mul_epu32(T0, S41);
3110ac341f1SConrad Meyer             v01 = H3;
3120ac341f1SConrad Meyer             v01 = _mm_mul_epu32(v01, T15);
3130ac341f1SConrad Meyer             T14 = S43;
3140ac341f1SConrad Meyer             T1  = H4;
3150ac341f1SConrad Meyer             T1  = _mm_mul_epu32(T1, T15);
3160ac341f1SConrad Meyer             v11 = H3;
3170ac341f1SConrad Meyer             v11 = _mm_mul_epu32(v11, T14);
3180ac341f1SConrad Meyer             T2  = H4;
3190ac341f1SConrad Meyer             T2  = _mm_mul_epu32(T2, T14);
3200ac341f1SConrad Meyer             T0  = _mm_add_epi64(T0, v01);
3210ac341f1SConrad Meyer             T15 = S44;
3220ac341f1SConrad Meyer             v02 = H2;
3230ac341f1SConrad Meyer             v02 = _mm_mul_epu32(v02, T14);
3240ac341f1SConrad Meyer             T3  = H4;
3250ac341f1SConrad Meyer             T3  = _mm_mul_epu32(T3, T15);
3260ac341f1SConrad Meyer             T1  = _mm_add_epi64(T1, v11);
3270ac341f1SConrad Meyer             v03 = H1;
3280ac341f1SConrad Meyer             v03 = _mm_mul_epu32(v03, T15);
3290ac341f1SConrad Meyer             v12 = H2;
3300ac341f1SConrad Meyer             v12 = _mm_mul_epu32(v12, T15);
3310ac341f1SConrad Meyer             T0  = _mm_add_epi64(T0, v02);
3320ac341f1SConrad Meyer             T14 = R40;
3330ac341f1SConrad Meyer             v21 = H3;
3340ac341f1SConrad Meyer             v21 = _mm_mul_epu32(v21, T15);
3350ac341f1SConrad Meyer             v31 = H3;
3360ac341f1SConrad Meyer             v31 = _mm_mul_epu32(v31, T14);
3370ac341f1SConrad Meyer             T0  = _mm_add_epi64(T0, v03);
3380ac341f1SConrad Meyer             T4  = H4;
3390ac341f1SConrad Meyer             T4  = _mm_mul_epu32(T4, T14);
3400ac341f1SConrad Meyer             T1  = _mm_add_epi64(T1, v12);
3410ac341f1SConrad Meyer             v04 = H0;
3420ac341f1SConrad Meyer             v04 = _mm_mul_epu32(v04, T14);
3430ac341f1SConrad Meyer             T2  = _mm_add_epi64(T2, v21);
3440ac341f1SConrad Meyer             v13 = H1;
3450ac341f1SConrad Meyer             v13 = _mm_mul_epu32(v13, T14);
3460ac341f1SConrad Meyer             T3  = _mm_add_epi64(T3, v31);
3470ac341f1SConrad Meyer             T15 = R41;
3480ac341f1SConrad Meyer             v22 = H2;
3490ac341f1SConrad Meyer             v22 = _mm_mul_epu32(v22, T14);
3500ac341f1SConrad Meyer             v32 = H2;
3510ac341f1SConrad Meyer             v32 = _mm_mul_epu32(v32, T15);
3520ac341f1SConrad Meyer             T0  = _mm_add_epi64(T0, v04);
3530ac341f1SConrad Meyer             v41 = H3;
3540ac341f1SConrad Meyer             v41 = _mm_mul_epu32(v41, T15);
3550ac341f1SConrad Meyer             T1  = _mm_add_epi64(T1, v13);
3560ac341f1SConrad Meyer             v14 = H0;
3570ac341f1SConrad Meyer             v14 = _mm_mul_epu32(v14, T15);
3580ac341f1SConrad Meyer             T2  = _mm_add_epi64(T2, v22);
3590ac341f1SConrad Meyer             T14 = R42;
3600ac341f1SConrad Meyer             T5  = _mm_unpacklo_epi64(
3610ac341f1SConrad Meyer                 _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
3620ac341f1SConrad Meyer                 _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
3630ac341f1SConrad Meyer             v23 = H1;
3640ac341f1SConrad Meyer             v23 = _mm_mul_epu32(v23, T15);
3650ac341f1SConrad Meyer             T3  = _mm_add_epi64(T3, v32);
3660ac341f1SConrad Meyer             v33 = H1;
3670ac341f1SConrad Meyer             v33 = _mm_mul_epu32(v33, T14);
3680ac341f1SConrad Meyer             T4  = _mm_add_epi64(T4, v41);
3690ac341f1SConrad Meyer             v42 = H2;
3700ac341f1SConrad Meyer             v42 = _mm_mul_epu32(v42, T14);
3710ac341f1SConrad Meyer             T1  = _mm_add_epi64(T1, v14);
3720ac341f1SConrad Meyer             T15 = R43;
3730ac341f1SConrad Meyer             T6  = _mm_unpacklo_epi64(
3740ac341f1SConrad Meyer                 _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
3750ac341f1SConrad Meyer                 _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
3760ac341f1SConrad Meyer             v24 = H0;
3770ac341f1SConrad Meyer             v24 = _mm_mul_epu32(v24, T14);
3780ac341f1SConrad Meyer             T2  = _mm_add_epi64(T2, v23);
3790ac341f1SConrad Meyer             v34 = H0;
3800ac341f1SConrad Meyer             v34 = _mm_mul_epu32(v34, T15);
3810ac341f1SConrad Meyer             T3  = _mm_add_epi64(T3, v33);
3820ac341f1SConrad Meyer             M0  = _mm_and_si128(MMASK, T5);
3830ac341f1SConrad Meyer             v43 = H1;
3840ac341f1SConrad Meyer             v43 = _mm_mul_epu32(v43, T15);
3850ac341f1SConrad Meyer             T4  = _mm_add_epi64(T4, v42);
3860ac341f1SConrad Meyer             M1  = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
3870ac341f1SConrad Meyer             v44 = H0;
3880ac341f1SConrad Meyer             v44 = _mm_mul_epu32(v44, R44);
3890ac341f1SConrad Meyer             T2  = _mm_add_epi64(T2, v24);
3900ac341f1SConrad Meyer             T5  = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
3910ac341f1SConrad Meyer             T3  = _mm_add_epi64(T3, v34);
3920ac341f1SConrad Meyer             M3  = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
3930ac341f1SConrad Meyer             T4  = _mm_add_epi64(T4, v43);
3940ac341f1SConrad Meyer             M2  = _mm_and_si128(MMASK, T5);
3950ac341f1SConrad Meyer             T4  = _mm_add_epi64(T4, v44);
3960ac341f1SConrad Meyer             M4  = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
3970ac341f1SConrad Meyer 
3980ac341f1SConrad Meyer             /* H += [Mx',My'] */
3990ac341f1SConrad Meyer             T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
4000ac341f1SConrad Meyer             T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
4010ac341f1SConrad Meyer             T7 = _mm_unpacklo_epi32(T5, T6);
4020ac341f1SConrad Meyer             T8 = _mm_unpackhi_epi32(T5, T6);
4030ac341f1SConrad Meyer             M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
4040ac341f1SConrad Meyer             M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
4050ac341f1SConrad Meyer             M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
4060ac341f1SConrad Meyer             M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
4070ac341f1SConrad Meyer             M6 = _mm_slli_epi64(M6, 6);
4080ac341f1SConrad Meyer             M7 = _mm_slli_epi64(M7, 12);
4090ac341f1SConrad Meyer             M8 = _mm_slli_epi64(M8, 18);
4100ac341f1SConrad Meyer             T0 = _mm_add_epi64(T0, M5);
4110ac341f1SConrad Meyer             T1 = _mm_add_epi64(T1, M6);
4120ac341f1SConrad Meyer             T2 = _mm_add_epi64(T2, M7);
4130ac341f1SConrad Meyer             T3 = _mm_add_epi64(T3, M8);
4140ac341f1SConrad Meyer             T4 = _mm_add_epi64(T4, HIBIT);
4150ac341f1SConrad Meyer 
4160ac341f1SConrad Meyer             /* H += [Mx,My]*[r^2,r^2] */
4170ac341f1SConrad Meyer             T15 = S22;
4180ac341f1SConrad Meyer             v00 = M4;
4190ac341f1SConrad Meyer             v00 = _mm_mul_epu32(v00, S21);
4200ac341f1SConrad Meyer             v01 = M3;
4210ac341f1SConrad Meyer             v01 = _mm_mul_epu32(v01, T15);
4220ac341f1SConrad Meyer             T14 = S23;
4230ac341f1SConrad Meyer             v10 = M4;
4240ac341f1SConrad Meyer             v10 = _mm_mul_epu32(v10, T15);
4250ac341f1SConrad Meyer             v11 = M3;
4260ac341f1SConrad Meyer             v11 = _mm_mul_epu32(v11, T14);
4270ac341f1SConrad Meyer             T0  = _mm_add_epi64(T0, v00);
4280ac341f1SConrad Meyer             v20 = M4;
4290ac341f1SConrad Meyer             v20 = _mm_mul_epu32(v20, T14);
4300ac341f1SConrad Meyer             T0  = _mm_add_epi64(T0, v01);
4310ac341f1SConrad Meyer             T15 = S24;
4320ac341f1SConrad Meyer             v02 = M2;
4330ac341f1SConrad Meyer             v02 = _mm_mul_epu32(v02, T14);
4340ac341f1SConrad Meyer             T1  = _mm_add_epi64(T1, v10);
4350ac341f1SConrad Meyer             v30 = M4;
4360ac341f1SConrad Meyer             v30 = _mm_mul_epu32(v30, T15);
4370ac341f1SConrad Meyer             T1  = _mm_add_epi64(T1, v11);
4380ac341f1SConrad Meyer             v03 = M1;
4390ac341f1SConrad Meyer             v03 = _mm_mul_epu32(v03, T15);
4400ac341f1SConrad Meyer             T2  = _mm_add_epi64(T2, v20);
4410ac341f1SConrad Meyer             v12 = M2;
4420ac341f1SConrad Meyer             v12 = _mm_mul_epu32(v12, T15);
4430ac341f1SConrad Meyer             T0  = _mm_add_epi64(T0, v02);
4440ac341f1SConrad Meyer             T14 = R20;
4450ac341f1SConrad Meyer             v21 = M3;
4460ac341f1SConrad Meyer             v21 = _mm_mul_epu32(v21, T15);
4470ac341f1SConrad Meyer             T3  = _mm_add_epi64(T3, v30);
4480ac341f1SConrad Meyer             v31 = M3;
4490ac341f1SConrad Meyer             v31 = _mm_mul_epu32(v31, T14);
4500ac341f1SConrad Meyer             T0  = _mm_add_epi64(T0, v03);
4510ac341f1SConrad Meyer             v40 = M4;
4520ac341f1SConrad Meyer             v40 = _mm_mul_epu32(v40, T14);
4530ac341f1SConrad Meyer             T1  = _mm_add_epi64(T1, v12);
4540ac341f1SConrad Meyer             v04 = M0;
4550ac341f1SConrad Meyer             v04 = _mm_mul_epu32(v04, T14);
4560ac341f1SConrad Meyer             T2  = _mm_add_epi64(T2, v21);
4570ac341f1SConrad Meyer             v13 = M1;
4580ac341f1SConrad Meyer             v13 = _mm_mul_epu32(v13, T14);
4590ac341f1SConrad Meyer             T3  = _mm_add_epi64(T3, v31);
4600ac341f1SConrad Meyer             T15 = R21;
4610ac341f1SConrad Meyer             v22 = M2;
4620ac341f1SConrad Meyer             v22 = _mm_mul_epu32(v22, T14);
4630ac341f1SConrad Meyer             T4  = _mm_add_epi64(T4, v40);
4640ac341f1SConrad Meyer             v32 = M2;
4650ac341f1SConrad Meyer             v32 = _mm_mul_epu32(v32, T15);
4660ac341f1SConrad Meyer             T0  = _mm_add_epi64(T0, v04);
4670ac341f1SConrad Meyer             v41 = M3;
4680ac341f1SConrad Meyer             v41 = _mm_mul_epu32(v41, T15);
4690ac341f1SConrad Meyer             T1  = _mm_add_epi64(T1, v13);
4700ac341f1SConrad Meyer             v14 = M0;
4710ac341f1SConrad Meyer             v14 = _mm_mul_epu32(v14, T15);
4720ac341f1SConrad Meyer             T2  = _mm_add_epi64(T2, v22);
4730ac341f1SConrad Meyer             T14 = R22;
4740ac341f1SConrad Meyer             v23 = M1;
4750ac341f1SConrad Meyer             v23 = _mm_mul_epu32(v23, T15);
4760ac341f1SConrad Meyer             T3  = _mm_add_epi64(T3, v32);
4770ac341f1SConrad Meyer             v33 = M1;
4780ac341f1SConrad Meyer             v33 = _mm_mul_epu32(v33, T14);
4790ac341f1SConrad Meyer             T4  = _mm_add_epi64(T4, v41);
4800ac341f1SConrad Meyer             v42 = M2;
4810ac341f1SConrad Meyer             v42 = _mm_mul_epu32(v42, T14);
4820ac341f1SConrad Meyer             T1  = _mm_add_epi64(T1, v14);
4830ac341f1SConrad Meyer             T15 = R23;
4840ac341f1SConrad Meyer             v24 = M0;
4850ac341f1SConrad Meyer             v24 = _mm_mul_epu32(v24, T14);
4860ac341f1SConrad Meyer             T2  = _mm_add_epi64(T2, v23);
4870ac341f1SConrad Meyer             v34 = M0;
4880ac341f1SConrad Meyer             v34 = _mm_mul_epu32(v34, T15);
4890ac341f1SConrad Meyer             T3  = _mm_add_epi64(T3, v33);
4900ac341f1SConrad Meyer             v43 = M1;
4910ac341f1SConrad Meyer             v43 = _mm_mul_epu32(v43, T15);
4920ac341f1SConrad Meyer             T4  = _mm_add_epi64(T4, v42);
4930ac341f1SConrad Meyer             v44 = M0;
4940ac341f1SConrad Meyer             v44 = _mm_mul_epu32(v44, R24);
4950ac341f1SConrad Meyer             T2  = _mm_add_epi64(T2, v24);
4960ac341f1SConrad Meyer             T3  = _mm_add_epi64(T3, v34);
4970ac341f1SConrad Meyer             T4  = _mm_add_epi64(T4, v43);
4980ac341f1SConrad Meyer             T4  = _mm_add_epi64(T4, v44);
4990ac341f1SConrad Meyer 
5000ac341f1SConrad Meyer             /* reduce */
5010ac341f1SConrad Meyer             C1 = _mm_srli_epi64(T0, 26);
5020ac341f1SConrad Meyer             C2 = _mm_srli_epi64(T3, 26);
5030ac341f1SConrad Meyer             T0 = _mm_and_si128(T0, MMASK);
5040ac341f1SConrad Meyer             T3 = _mm_and_si128(T3, MMASK);
5050ac341f1SConrad Meyer             T1 = _mm_add_epi64(T1, C1);
5060ac341f1SConrad Meyer             T4 = _mm_add_epi64(T4, C2);
5070ac341f1SConrad Meyer             C1 = _mm_srli_epi64(T1, 26);
5080ac341f1SConrad Meyer             C2 = _mm_srli_epi64(T4, 26);
5090ac341f1SConrad Meyer             T1 = _mm_and_si128(T1, MMASK);
5100ac341f1SConrad Meyer             T4 = _mm_and_si128(T4, MMASK);
5110ac341f1SConrad Meyer             T2 = _mm_add_epi64(T2, C1);
5120ac341f1SConrad Meyer             T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
5130ac341f1SConrad Meyer             C1 = _mm_srli_epi64(T2, 26);
5140ac341f1SConrad Meyer             C2 = _mm_srli_epi64(T0, 26);
5150ac341f1SConrad Meyer             T2 = _mm_and_si128(T2, MMASK);
5160ac341f1SConrad Meyer             T0 = _mm_and_si128(T0, MMASK);
5170ac341f1SConrad Meyer             T3 = _mm_add_epi64(T3, C1);
5180ac341f1SConrad Meyer             T1 = _mm_add_epi64(T1, C2);
5190ac341f1SConrad Meyer             C1 = _mm_srli_epi64(T3, 26);
5200ac341f1SConrad Meyer             T3 = _mm_and_si128(T3, MMASK);
5210ac341f1SConrad Meyer             T4 = _mm_add_epi64(T4, C1);
5220ac341f1SConrad Meyer 
5230ac341f1SConrad Meyer             /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
5240ac341f1SConrad Meyer             H0 = T0;
5250ac341f1SConrad Meyer             H1 = T1;
5260ac341f1SConrad Meyer             H2 = T2;
5270ac341f1SConrad Meyer             H3 = T3;
5280ac341f1SConrad Meyer             H4 = T4;
5290ac341f1SConrad Meyer 
5300ac341f1SConrad Meyer             m += 64;
5310ac341f1SConrad Meyer             bytes -= 64;
5320ac341f1SConrad Meyer         }
5330ac341f1SConrad Meyer     }
5340ac341f1SConrad Meyer 
5350ac341f1SConrad Meyer     if (bytes >= 32) {
5360ac341f1SConrad Meyer         xmmi v01, v02, v03, v04;
5370ac341f1SConrad Meyer         xmmi v11, v12, v13, v14;
5380ac341f1SConrad Meyer         xmmi v21, v22, v23, v24;
5390ac341f1SConrad Meyer         xmmi v31, v32, v33, v34;
5400ac341f1SConrad Meyer         xmmi v41, v42, v43, v44;
5410ac341f1SConrad Meyer         xmmi T14, T15;
5420ac341f1SConrad Meyer 
5430ac341f1SConrad Meyer         /* H *= [r^2,r^2] */
5440ac341f1SConrad Meyer         T15 = S22;
5450ac341f1SConrad Meyer         T0  = H4;
5460ac341f1SConrad Meyer         T0  = _mm_mul_epu32(T0, S21);
5470ac341f1SConrad Meyer         v01 = H3;
5480ac341f1SConrad Meyer         v01 = _mm_mul_epu32(v01, T15);
5490ac341f1SConrad Meyer         T14 = S23;
5500ac341f1SConrad Meyer         T1  = H4;
5510ac341f1SConrad Meyer         T1  = _mm_mul_epu32(T1, T15);
5520ac341f1SConrad Meyer         v11 = H3;
5530ac341f1SConrad Meyer         v11 = _mm_mul_epu32(v11, T14);
5540ac341f1SConrad Meyer         T2  = H4;
5550ac341f1SConrad Meyer         T2  = _mm_mul_epu32(T2, T14);
5560ac341f1SConrad Meyer         T0  = _mm_add_epi64(T0, v01);
5570ac341f1SConrad Meyer         T15 = S24;
5580ac341f1SConrad Meyer         v02 = H2;
5590ac341f1SConrad Meyer         v02 = _mm_mul_epu32(v02, T14);
5600ac341f1SConrad Meyer         T3  = H4;
5610ac341f1SConrad Meyer         T3  = _mm_mul_epu32(T3, T15);
5620ac341f1SConrad Meyer         T1  = _mm_add_epi64(T1, v11);
5630ac341f1SConrad Meyer         v03 = H1;
5640ac341f1SConrad Meyer         v03 = _mm_mul_epu32(v03, T15);
5650ac341f1SConrad Meyer         v12 = H2;
5660ac341f1SConrad Meyer         v12 = _mm_mul_epu32(v12, T15);
5670ac341f1SConrad Meyer         T0  = _mm_add_epi64(T0, v02);
5680ac341f1SConrad Meyer         T14 = R20;
5690ac341f1SConrad Meyer         v21 = H3;
5700ac341f1SConrad Meyer         v21 = _mm_mul_epu32(v21, T15);
5710ac341f1SConrad Meyer         v31 = H3;
5720ac341f1SConrad Meyer         v31 = _mm_mul_epu32(v31, T14);
5730ac341f1SConrad Meyer         T0  = _mm_add_epi64(T0, v03);
5740ac341f1SConrad Meyer         T4  = H4;
5750ac341f1SConrad Meyer         T4  = _mm_mul_epu32(T4, T14);
5760ac341f1SConrad Meyer         T1  = _mm_add_epi64(T1, v12);
5770ac341f1SConrad Meyer         v04 = H0;
5780ac341f1SConrad Meyer         v04 = _mm_mul_epu32(v04, T14);
5790ac341f1SConrad Meyer         T2  = _mm_add_epi64(T2, v21);
5800ac341f1SConrad Meyer         v13 = H1;
5810ac341f1SConrad Meyer         v13 = _mm_mul_epu32(v13, T14);
5820ac341f1SConrad Meyer         T3  = _mm_add_epi64(T3, v31);
5830ac341f1SConrad Meyer         T15 = R21;
5840ac341f1SConrad Meyer         v22 = H2;
5850ac341f1SConrad Meyer         v22 = _mm_mul_epu32(v22, T14);
5860ac341f1SConrad Meyer         v32 = H2;
5870ac341f1SConrad Meyer         v32 = _mm_mul_epu32(v32, T15);
5880ac341f1SConrad Meyer         T0  = _mm_add_epi64(T0, v04);
5890ac341f1SConrad Meyer         v41 = H3;
5900ac341f1SConrad Meyer         v41 = _mm_mul_epu32(v41, T15);
5910ac341f1SConrad Meyer         T1  = _mm_add_epi64(T1, v13);
5920ac341f1SConrad Meyer         v14 = H0;
5930ac341f1SConrad Meyer         v14 = _mm_mul_epu32(v14, T15);
5940ac341f1SConrad Meyer         T2  = _mm_add_epi64(T2, v22);
5950ac341f1SConrad Meyer         T14 = R22;
5960ac341f1SConrad Meyer         v23 = H1;
5970ac341f1SConrad Meyer         v23 = _mm_mul_epu32(v23, T15);
5980ac341f1SConrad Meyer         T3  = _mm_add_epi64(T3, v32);
5990ac341f1SConrad Meyer         v33 = H1;
6000ac341f1SConrad Meyer         v33 = _mm_mul_epu32(v33, T14);
6010ac341f1SConrad Meyer         T4  = _mm_add_epi64(T4, v41);
6020ac341f1SConrad Meyer         v42 = H2;
6030ac341f1SConrad Meyer         v42 = _mm_mul_epu32(v42, T14);
6040ac341f1SConrad Meyer         T1  = _mm_add_epi64(T1, v14);
6050ac341f1SConrad Meyer         T15 = R23;
6060ac341f1SConrad Meyer         v24 = H0;
6070ac341f1SConrad Meyer         v24 = _mm_mul_epu32(v24, T14);
6080ac341f1SConrad Meyer         T2  = _mm_add_epi64(T2, v23);
6090ac341f1SConrad Meyer         v34 = H0;
6100ac341f1SConrad Meyer         v34 = _mm_mul_epu32(v34, T15);
6110ac341f1SConrad Meyer         T3  = _mm_add_epi64(T3, v33);
6120ac341f1SConrad Meyer         v43 = H1;
6130ac341f1SConrad Meyer         v43 = _mm_mul_epu32(v43, T15);
6140ac341f1SConrad Meyer         T4  = _mm_add_epi64(T4, v42);
6150ac341f1SConrad Meyer         v44 = H0;
6160ac341f1SConrad Meyer         v44 = _mm_mul_epu32(v44, R24);
6170ac341f1SConrad Meyer         T2  = _mm_add_epi64(T2, v24);
6180ac341f1SConrad Meyer         T3  = _mm_add_epi64(T3, v34);
6190ac341f1SConrad Meyer         T4  = _mm_add_epi64(T4, v43);
6200ac341f1SConrad Meyer         T4  = _mm_add_epi64(T4, v44);
6210ac341f1SConrad Meyer 
6220ac341f1SConrad Meyer         /* H += [Mx,My] */
6230ac341f1SConrad Meyer         if (m) {
6240ac341f1SConrad Meyer             T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
6250ac341f1SConrad Meyer             T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
6260ac341f1SConrad Meyer             T7 = _mm_unpacklo_epi32(T5, T6);
6270ac341f1SConrad Meyer             T8 = _mm_unpackhi_epi32(T5, T6);
6280ac341f1SConrad Meyer             M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
6290ac341f1SConrad Meyer             M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
6300ac341f1SConrad Meyer             M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
6310ac341f1SConrad Meyer             M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
6320ac341f1SConrad Meyer             M1 = _mm_slli_epi64(M1, 6);
6330ac341f1SConrad Meyer             M2 = _mm_slli_epi64(M2, 12);
6340ac341f1SConrad Meyer             M3 = _mm_slli_epi64(M3, 18);
6350ac341f1SConrad Meyer             T0 = _mm_add_epi64(T0, M0);
6360ac341f1SConrad Meyer             T1 = _mm_add_epi64(T1, M1);
6370ac341f1SConrad Meyer             T2 = _mm_add_epi64(T2, M2);
6380ac341f1SConrad Meyer             T3 = _mm_add_epi64(T3, M3);
6390ac341f1SConrad Meyer             T4 = _mm_add_epi64(T4, HIBIT);
6400ac341f1SConrad Meyer         }
6410ac341f1SConrad Meyer 
6420ac341f1SConrad Meyer         /* reduce */
6430ac341f1SConrad Meyer         C1 = _mm_srli_epi64(T0, 26);
6440ac341f1SConrad Meyer         C2 = _mm_srli_epi64(T3, 26);
6450ac341f1SConrad Meyer         T0 = _mm_and_si128(T0, MMASK);
6460ac341f1SConrad Meyer         T3 = _mm_and_si128(T3, MMASK);
6470ac341f1SConrad Meyer         T1 = _mm_add_epi64(T1, C1);
6480ac341f1SConrad Meyer         T4 = _mm_add_epi64(T4, C2);
6490ac341f1SConrad Meyer         C1 = _mm_srli_epi64(T1, 26);
6500ac341f1SConrad Meyer         C2 = _mm_srli_epi64(T4, 26);
6510ac341f1SConrad Meyer         T1 = _mm_and_si128(T1, MMASK);
6520ac341f1SConrad Meyer         T4 = _mm_and_si128(T4, MMASK);
6530ac341f1SConrad Meyer         T2 = _mm_add_epi64(T2, C1);
6540ac341f1SConrad Meyer         T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
6550ac341f1SConrad Meyer         C1 = _mm_srli_epi64(T2, 26);
6560ac341f1SConrad Meyer         C2 = _mm_srli_epi64(T0, 26);
6570ac341f1SConrad Meyer         T2 = _mm_and_si128(T2, MMASK);
6580ac341f1SConrad Meyer         T0 = _mm_and_si128(T0, MMASK);
6590ac341f1SConrad Meyer         T3 = _mm_add_epi64(T3, C1);
6600ac341f1SConrad Meyer         T1 = _mm_add_epi64(T1, C2);
6610ac341f1SConrad Meyer         C1 = _mm_srli_epi64(T3, 26);
6620ac341f1SConrad Meyer         T3 = _mm_and_si128(T3, MMASK);
6630ac341f1SConrad Meyer         T4 = _mm_add_epi64(T4, C1);
6640ac341f1SConrad Meyer 
6650ac341f1SConrad Meyer         /* H = (H*[r^2,r^2] + [Mx,My]) */
6660ac341f1SConrad Meyer         H0 = T0;
6670ac341f1SConrad Meyer         H1 = T1;
6680ac341f1SConrad Meyer         H2 = T2;
6690ac341f1SConrad Meyer         H3 = T3;
6700ac341f1SConrad Meyer         H4 = T4;
6710ac341f1SConrad Meyer     }
6720ac341f1SConrad Meyer 
6730ac341f1SConrad Meyer     if (m) {
6740ac341f1SConrad Meyer         T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
6750ac341f1SConrad Meyer         T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
6760ac341f1SConrad Meyer         T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
6770ac341f1SConrad Meyer         T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
6780ac341f1SConrad Meyer         T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
6790ac341f1SConrad Meyer         T0 = _mm_unpacklo_epi64(T0, T1);
6800ac341f1SConrad Meyer         T1 = _mm_unpacklo_epi64(T2, T3);
6810ac341f1SConrad Meyer         _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0);
6820ac341f1SConrad Meyer         _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1);
6830ac341f1SConrad Meyer         _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4);
6840ac341f1SConrad Meyer     } else {
6850ac341f1SConrad Meyer         uint32_t t0, t1, t2, t3, t4, b;
6860ac341f1SConrad Meyer         uint64_t h0, h1, h2, g0, g1, g2, c, nc;
6870ac341f1SConrad Meyer 
6880ac341f1SConrad Meyer         /* H = H[0]+H[1] */
6890ac341f1SConrad Meyer         T0 = H0;
6900ac341f1SConrad Meyer         T1 = H1;
6910ac341f1SConrad Meyer         T2 = H2;
6920ac341f1SConrad Meyer         T3 = H3;
6930ac341f1SConrad Meyer         T4 = H4;
6940ac341f1SConrad Meyer 
6950ac341f1SConrad Meyer         T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
6960ac341f1SConrad Meyer         T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
6970ac341f1SConrad Meyer         T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
6980ac341f1SConrad Meyer         T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
6990ac341f1SConrad Meyer         T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
7000ac341f1SConrad Meyer 
7010ac341f1SConrad Meyer         t0 = _mm_cvtsi128_si32(T0);
7020ac341f1SConrad Meyer         b  = (t0 >> 26);
7030ac341f1SConrad Meyer         t0 &= 0x3ffffff;
7040ac341f1SConrad Meyer         t1 = _mm_cvtsi128_si32(T1) + b;
7050ac341f1SConrad Meyer         b  = (t1 >> 26);
7060ac341f1SConrad Meyer         t1 &= 0x3ffffff;
7070ac341f1SConrad Meyer         t2 = _mm_cvtsi128_si32(T2) + b;
7080ac341f1SConrad Meyer         b  = (t2 >> 26);
7090ac341f1SConrad Meyer         t2 &= 0x3ffffff;
7100ac341f1SConrad Meyer         t3 = _mm_cvtsi128_si32(T3) + b;
7110ac341f1SConrad Meyer         b  = (t3 >> 26);
7120ac341f1SConrad Meyer         t3 &= 0x3ffffff;
7130ac341f1SConrad Meyer         t4 = _mm_cvtsi128_si32(T4) + b;
7140ac341f1SConrad Meyer 
7150ac341f1SConrad Meyer         /* everything except t4 is in range, so this is all safe */
7160ac341f1SConrad Meyer         h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
7170ac341f1SConrad Meyer         h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
7180ac341f1SConrad Meyer               ((uint64_t) t3 << 34)) &
7190ac341f1SConrad Meyer              0xfffffffffffull;
7200ac341f1SConrad Meyer         h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));
7210ac341f1SConrad Meyer 
7220ac341f1SConrad Meyer         c = (h2 >> 42);
7230ac341f1SConrad Meyer         h2 &= 0x3ffffffffff;
7240ac341f1SConrad Meyer         h0 += c * 5;
7250ac341f1SConrad Meyer         c = (h0 >> 44);
7260ac341f1SConrad Meyer         h0 &= 0xfffffffffff;
7270ac341f1SConrad Meyer         h1 += c;
7280ac341f1SConrad Meyer         c = (h1 >> 44);
7290ac341f1SConrad Meyer         h1 &= 0xfffffffffff;
7300ac341f1SConrad Meyer         h2 += c;
7310ac341f1SConrad Meyer         c = (h2 >> 42);
7320ac341f1SConrad Meyer         h2 &= 0x3ffffffffff;
7330ac341f1SConrad Meyer         h0 += c * 5;
7340ac341f1SConrad Meyer         c = (h0 >> 44);
7350ac341f1SConrad Meyer         h0 &= 0xfffffffffff;
7360ac341f1SConrad Meyer         h1 += c;
7370ac341f1SConrad Meyer 
7380ac341f1SConrad Meyer         g0 = h0 + 5;
7390ac341f1SConrad Meyer         c  = (g0 >> 44);
7400ac341f1SConrad Meyer         g0 &= 0xfffffffffff;
7410ac341f1SConrad Meyer         g1 = h1 + c;
7420ac341f1SConrad Meyer         c  = (g1 >> 44);
7430ac341f1SConrad Meyer         g1 &= 0xfffffffffff;
7440ac341f1SConrad Meyer         g2 = h2 + c - ((uint64_t) 1 << 42);
7450ac341f1SConrad Meyer 
7460ac341f1SConrad Meyer         c  = (g2 >> 63) - 1;
7470ac341f1SConrad Meyer         nc = ~c;
7480ac341f1SConrad Meyer         h0 = (h0 & nc) | (g0 & c);
7490ac341f1SConrad Meyer         h1 = (h1 & nc) | (g1 & c);
7500ac341f1SConrad Meyer         h2 = (h2 & nc) | (g2 & c);
7510ac341f1SConrad Meyer 
7520ac341f1SConrad Meyer         st->H.h[0] = h0;
7530ac341f1SConrad Meyer         st->H.h[1] = h1;
7540ac341f1SConrad Meyer         st->H.h[2] = h2;
7550ac341f1SConrad Meyer     }
7560ac341f1SConrad Meyer }
7570ac341f1SConrad Meyer 
7580ac341f1SConrad Meyer static void
poly1305_update(poly1305_state_internal_t * st,const unsigned char * m,unsigned long long bytes)7590ac341f1SConrad Meyer poly1305_update(poly1305_state_internal_t *st, const unsigned char *m,
7600ac341f1SConrad Meyer                 unsigned long long bytes)
7610ac341f1SConrad Meyer {
7620ac341f1SConrad Meyer     unsigned long long i;
7630ac341f1SConrad Meyer 
7640ac341f1SConrad Meyer     /* handle leftover */
7650ac341f1SConrad Meyer     if (st->leftover) {
7660ac341f1SConrad Meyer         unsigned long long want = (poly1305_block_size - st->leftover);
7670ac341f1SConrad Meyer 
7680ac341f1SConrad Meyer         if (want > bytes) {
7690ac341f1SConrad Meyer             want = bytes;
7700ac341f1SConrad Meyer         }
7710ac341f1SConrad Meyer         for (i = 0; i < want; i++) {
7720ac341f1SConrad Meyer             st->buffer[st->leftover + i] = m[i];
7730ac341f1SConrad Meyer         }
7740ac341f1SConrad Meyer         bytes -= want;
7750ac341f1SConrad Meyer         m += want;
7760ac341f1SConrad Meyer         st->leftover += want;
7770ac341f1SConrad Meyer         if (st->leftover < poly1305_block_size) {
7780ac341f1SConrad Meyer             return;
7790ac341f1SConrad Meyer         }
7800ac341f1SConrad Meyer         poly1305_blocks(st, st->buffer, poly1305_block_size);
7810ac341f1SConrad Meyer         st->leftover = 0;
7820ac341f1SConrad Meyer     }
7830ac341f1SConrad Meyer 
7840ac341f1SConrad Meyer     /* process full blocks */
7850ac341f1SConrad Meyer     if (bytes >= poly1305_block_size) {
7860ac341f1SConrad Meyer         unsigned long long want = (bytes & ~(poly1305_block_size - 1));
7870ac341f1SConrad Meyer 
7880ac341f1SConrad Meyer         poly1305_blocks(st, m, want);
7890ac341f1SConrad Meyer         m += want;
7900ac341f1SConrad Meyer         bytes -= want;
7910ac341f1SConrad Meyer     }
7920ac341f1SConrad Meyer 
7930ac341f1SConrad Meyer     /* store leftover */
7940ac341f1SConrad Meyer     if (bytes) {
7950ac341f1SConrad Meyer         for (i = 0; i < bytes; i++) {
7960ac341f1SConrad Meyer             st->buffer[st->leftover + i] = m[i];
7970ac341f1SConrad Meyer         }
7980ac341f1SConrad Meyer         st->leftover += bytes;
7990ac341f1SConrad Meyer     }
8000ac341f1SConrad Meyer }
8010ac341f1SConrad Meyer 
8020ac341f1SConrad Meyer static POLY1305_NOINLINE void
poly1305_finish_ext(poly1305_state_internal_t * st,const unsigned char * m,unsigned long long leftover,unsigned char mac[16])8030ac341f1SConrad Meyer poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
8040ac341f1SConrad Meyer                     unsigned long long leftover, unsigned char mac[16])
8050ac341f1SConrad Meyer {
8060ac341f1SConrad Meyer     uint64_t h0, h1, h2;
8070ac341f1SConrad Meyer 
8080ac341f1SConrad Meyer     if (leftover) {
8090ac341f1SConrad Meyer         CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };
8100ac341f1SConrad Meyer 
8110ac341f1SConrad Meyer         poly1305_block_copy31(final, m, leftover);
8120ac341f1SConrad Meyer         if (leftover != 16) {
8130ac341f1SConrad Meyer             final[leftover] = 1;
8140ac341f1SConrad Meyer         }
8150ac341f1SConrad Meyer         st->flags |=
8160ac341f1SConrad Meyer             (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
8170ac341f1SConrad Meyer         poly1305_blocks(st, final, 32);
8180ac341f1SConrad Meyer     }
8190ac341f1SConrad Meyer 
8200ac341f1SConrad Meyer     if (st->flags & poly1305_started) {
8210ac341f1SConrad Meyer         /* finalize, H *= [r^2,r], or H *= [r,1] */
8220ac341f1SConrad Meyer         if (!leftover || (leftover > 16)) {
8230ac341f1SConrad Meyer             st->flags |= poly1305_final_r2_r;
8240ac341f1SConrad Meyer         } else {
8250ac341f1SConrad Meyer             st->flags |= poly1305_final_r_1;
8260ac341f1SConrad Meyer         }
8270ac341f1SConrad Meyer         poly1305_blocks(st, NULL, 32);
8280ac341f1SConrad Meyer     }
8290ac341f1SConrad Meyer 
8300ac341f1SConrad Meyer     h0 = st->H.h[0];
8310ac341f1SConrad Meyer     h1 = st->H.h[1];
8320ac341f1SConrad Meyer     h2 = st->H.h[2];
8330ac341f1SConrad Meyer 
8340ac341f1SConrad Meyer     /* pad */
8350ac341f1SConrad Meyer     h0 = ((h0) | (h1 << 44));
8360ac341f1SConrad Meyer     h1 = ((h1 >> 20) | (h2 << 24));
8370ac341f1SConrad Meyer #ifdef HAVE_AMD64_ASM
8380ac341f1SConrad Meyer     __asm__ __volatile__(
8390ac341f1SConrad Meyer         "addq %2, %0 ;\n"
8400ac341f1SConrad Meyer         "adcq %3, %1 ;\n"
8410ac341f1SConrad Meyer         : "+r"(h0), "+r"(h1)
8420ac341f1SConrad Meyer         : "r"(st->pad[0]), "r"(st->pad[1])
8430ac341f1SConrad Meyer         : "flags", "cc");
8440ac341f1SConrad Meyer #else
8450ac341f1SConrad Meyer     {
8460ac341f1SConrad Meyer         uint128_t h;
8470ac341f1SConrad Meyer 
8480ac341f1SConrad Meyer         memcpy(&h, &st->pad[0], 16);
8490ac341f1SConrad Meyer         h += ((uint128_t) h1 << 64) | h0;
8500ac341f1SConrad Meyer         h0 = (uint64_t) h;
8510ac341f1SConrad Meyer         h1 = (uint64_t)(h >> 64);
8520ac341f1SConrad Meyer     }
8530ac341f1SConrad Meyer #endif
8540ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
8550ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
8560ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
8570ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
8580ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
8590ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
8600ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
8610ac341f1SConrad Meyer     _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());
8620ac341f1SConrad Meyer 
8630ac341f1SConrad Meyer     memcpy(&mac[0], &h0, 8);
8640ac341f1SConrad Meyer     memcpy(&mac[8], &h1, 8);
8650ac341f1SConrad Meyer 
8660ac341f1SConrad Meyer     sodium_memzero((void *) st, sizeof *st);
8670ac341f1SConrad Meyer }
8680ac341f1SConrad Meyer 
8690ac341f1SConrad Meyer static void
poly1305_finish(poly1305_state_internal_t * st,unsigned char mac[16])8700ac341f1SConrad Meyer poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
8710ac341f1SConrad Meyer {
8720ac341f1SConrad Meyer     poly1305_finish_ext(st, st->buffer, st->leftover, mac);
8730ac341f1SConrad Meyer }
8740ac341f1SConrad Meyer 
8750ac341f1SConrad Meyer static int
crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state * state,const unsigned char * key)8760ac341f1SConrad Meyer crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
8770ac341f1SConrad Meyer                                       const unsigned char *key)
8780ac341f1SConrad Meyer {
8790ac341f1SConrad Meyer     COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
8800ac341f1SConrad Meyer                     sizeof(poly1305_state_internal_t));
8810ac341f1SConrad Meyer     poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);
8820ac341f1SConrad Meyer 
8830ac341f1SConrad Meyer     return 0;
8840ac341f1SConrad Meyer }
8850ac341f1SConrad Meyer 
8860ac341f1SConrad Meyer static int
crypto_onetimeauth_poly1305_sse2_update(crypto_onetimeauth_poly1305_state * state,const unsigned char * in,unsigned long long inlen)8870ac341f1SConrad Meyer crypto_onetimeauth_poly1305_sse2_update(
8880ac341f1SConrad Meyer     crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
8890ac341f1SConrad Meyer     unsigned long long inlen)
8900ac341f1SConrad Meyer {
8910ac341f1SConrad Meyer     poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);
8920ac341f1SConrad Meyer 
8930ac341f1SConrad Meyer     return 0;
8940ac341f1SConrad Meyer }
8950ac341f1SConrad Meyer 
8960ac341f1SConrad Meyer static int
crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state * state,unsigned char * out)8970ac341f1SConrad Meyer crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
8980ac341f1SConrad Meyer                                        unsigned char *out)
8990ac341f1SConrad Meyer {
9000ac341f1SConrad Meyer     poly1305_finish((poly1305_state_internal_t *) (void *) state, out);
9010ac341f1SConrad Meyer 
9020ac341f1SConrad Meyer     return 0;
9030ac341f1SConrad Meyer }
9040ac341f1SConrad Meyer 
9050ac341f1SConrad Meyer static int
crypto_onetimeauth_poly1305_sse2(unsigned char * out,const unsigned char * m,unsigned long long inlen,const unsigned char * key)9060ac341f1SConrad Meyer crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
9070ac341f1SConrad Meyer                                  unsigned long long   inlen,
9080ac341f1SConrad Meyer                                  const unsigned char *key)
9090ac341f1SConrad Meyer {
9100ac341f1SConrad Meyer     CRYPTO_ALIGN(64) poly1305_state_internal_t st;
9110ac341f1SConrad Meyer     unsigned long long                         blocks;
9120ac341f1SConrad Meyer 
9130ac341f1SConrad Meyer     poly1305_init_ext(&st, key, inlen);
9140ac341f1SConrad Meyer     blocks = inlen & ~31;
9150ac341f1SConrad Meyer     if (blocks > 0) {
9160ac341f1SConrad Meyer         poly1305_blocks(&st, m, blocks);
9170ac341f1SConrad Meyer         m += blocks;
9180ac341f1SConrad Meyer         inlen -= blocks;
9190ac341f1SConrad Meyer     }
9200ac341f1SConrad Meyer     poly1305_finish_ext(&st, m, inlen, out);
9210ac341f1SConrad Meyer 
9220ac341f1SConrad Meyer     return 0;
9230ac341f1SConrad Meyer }
9240ac341f1SConrad Meyer 
/*
 * One-shot verification: recompute the tag of the `inlen`-byte message `in`
 * under key `k` and compare it with the 16-byte tag `h`.
 *
 * Returns the result of crypto_verify_16() unchanged (per libsodium's
 * documented contract: 0 when the tags match, -1 otherwise).
 */
static int
crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
                                        const unsigned char *in,
                                        unsigned long long   inlen,
                                        const unsigned char *k)
{
    unsigned char correct[16];

    crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);

    return crypto_verify_16(h, correct);
}
9370ac341f1SConrad Meyer 
9380ac341f1SConrad Meyer struct crypto_onetimeauth_poly1305_implementation
9390ac341f1SConrad Meyer     crypto_onetimeauth_poly1305_sse2_implementation = {
9400ac341f1SConrad Meyer         SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
9410ac341f1SConrad Meyer         SODIUM_C99(.onetimeauth_verify =)
9420ac341f1SConrad Meyer             crypto_onetimeauth_poly1305_sse2_verify,
9430ac341f1SConrad Meyer         SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
9440ac341f1SConrad Meyer         SODIUM_C99(.onetimeauth_update =)
9450ac341f1SConrad Meyer             crypto_onetimeauth_poly1305_sse2_update,
9460ac341f1SConrad Meyer         SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
9470ac341f1SConrad Meyer     };
9480ac341f1SConrad Meyer 
9490ac341f1SConrad Meyer #endif
950