
#include <stdint.h>
#include <string.h>

#include "../onetimeauth_poly1305.h"
#include "crypto_verify_16.h"
#include "poly1305_sse2.h"
#include "private/common.h"
#include "private/sse2_64_32.h"
#include "utils.h"

#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)

# ifdef __GNUC__
#  pragma GCC target("sse2")
# endif

# include <emmintrin.h>

typedef __m128i xmmi;

# if defined(_MSC_VER)
#  define POLY1305_NOINLINE __declspec(noinline)
# elif defined(__clang__) || defined(__GNUC__)
#  define POLY1305_NOINLINE __attribute__((noinline))
# else
#  define POLY1305_NOINLINE
# endif

# define poly1305_block_size 32

enum poly1305_state_flags_t {
    poly1305_started       = 1,
    poly1305_final_shift8  = 4,
    poly1305_final_shift16 = 8,
    poly1305_final_r2_r    = 16, /* use [r^2,r] for the final block */
    poly1305_final_r_1     = 32  /* use [r,1] for the final block */
};

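/*
 * The accumulator H is stored in two shapes: hh[] holds five 26-bit limbs
 * for each of the two SIMD lanes while poly1305_blocks() is running, and
 * h[] holds the combined result as three 44/44/42-bit limbs once the lanes
 * have been summed during finalization. R, R2 and R4 hold r, r^2 and r^4 as
 * five 26-bit limbs each, and pad holds the second half ("s") of the key.
 */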
typedef struct poly1305_state_internal_t {
    union {
        uint64_t h[3];
        uint32_t hh[10];
    } H;                                            /*  40 bytes  */
    uint32_t           R[5];                        /*  20 bytes  */
    uint32_t           R2[5];                       /*  20 bytes  */
    uint32_t           R4[5];                       /*  20 bytes  */
    uint64_t           pad[2];                      /*  16 bytes  */
    uint64_t           flags;                       /*   8 bytes  */
    unsigned long long leftover;                    /*   8 bytes  */
    unsigned char      buffer[poly1305_block_size]; /*  32 bytes  */
} poly1305_state_internal_t;                        /* 164 bytes total */

/*
 * _mm_loadl_epi64() is turned into a simple MOVQ, so unaligned accesses are
 * totally fine, even though this intrinsic formally requires a __m128i*
 * input. This confuses dynamic analysis, so force alignment, but only in
 * debug mode.
 */
# ifdef DEBUG
static xmmi
_fakealign_mm_loadl_epi64(const void *m)
{
    xmmi tmp;
    memcpy(&tmp, m, 8);

    return _mm_loadl_epi64(&tmp);
}
#  define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
# endif

/* copy 0-31 bytes */
static inline void
poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
                      unsigned long long bytes)
{
    if (bytes & 16) {
        _mm_store_si128((xmmi *) (void *) dst,
                        _mm_loadu_si128((const xmmi *) (const void *) src));
        src += 16;
        dst += 16;
    }
    if (bytes & 8) {
        memcpy(dst, src, 8);
        src += 8;
        dst += 8;
    }
    if (bytes & 4) {
        memcpy(dst, src, 4);
        src += 4;
        dst += 4;
    }
    if (bytes & 2) {
        memcpy(dst, src, 2);
        src += 2;
        dst += 2;
    }
    if (bytes & 1) {
        *dst = *src;
    }
}

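/*
 * One-time initialization: clamp the first 16 key bytes into r, save the
 * last 16 bytes as the final pad ("s"), and precompute r^2 (when more than
 * 16 bytes are expected) and r^4 (when at least 96 bytes are expected) so
 * that poly1305_blocks() can process several blocks per iteration. `bytes`
 * is only a hint; 0 means "unknown length" and computes every power.
 */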
static POLY1305_NOINLINE void
poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
                  unsigned long long bytes)
{
    uint32_t          *R;
    uint128_t          d[3];
    uint64_t           r0, r1, r2;
    uint64_t           rt0, rt1, rt2, st2, c;
    uint64_t           t0, t1;
    unsigned long long i;

    if (!bytes) {
        bytes = ~(unsigned long long) 0;
    }
    /* H = 0 */
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128());

    /* clamp key */
    memcpy(&t0, key, 8);
    memcpy(&t1, key + 8, 8);
    r0 = t0 & 0xffc0fffffff;
    t0 >>= 44;
    t0 |= t1 << 20;
    r1 = t0 & 0xfffffc0ffff;
    t1 >>= 24;
    r2 = t1 & 0x00ffffffc0f;

    /* r^1 */
    R    = st->R;
    R[0] = (uint32_t)(r0) & 0x3ffffff;
    R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
    R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
    R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
    R[4] = (uint32_t)((r2 >> 16));

    /* save pad */
    memcpy(&st->pad[0], key + 16, 8);
    memcpy(&st->pad[1], key + 24, 8);

    rt0 = r0;
    rt1 = r1;
    rt2 = r2;

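    /*
     * Square r repeatedly in the 44/44/42-bit limb representation. Since
     * 2^130 is congruent to 5 modulo 2^130 - 5, cross terms that land at
     * weight 2^132 are folded back in multiplied by 4 * 5 = 20, which is the
     * factor precomputed as st2 = rt2 * (5 << 2).
     */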
    /* r^2, r^4 */
    for (i = 0; i < 2; i++) {
        if (i == 0) {
            R = st->R2;
            if (bytes <= 16) {
                break;
            }
        } else if (i == 1) {
            R = st->R4;
            if (bytes < 96) {
                break;
            }
        }
        st2 = rt2 * (5 << 2);

        d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
        d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
        d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);

        rt0 = (uint64_t) d[0] & 0xfffffffffff;
        c   = (uint64_t)(d[0] >> 44);
        d[1] += c;

        rt1 = (uint64_t) d[1] & 0xfffffffffff;
        c   = (uint64_t)(d[1] >> 44);
        d[2] += c;

        rt2 = (uint64_t) d[2] & 0x3ffffffffff;
        c   = (uint64_t)(d[2] >> 42);
        rt0 += c * 5;
        c   = (rt0 >> 44);
        rt0 = rt0 & 0xfffffffffff;
        rt1 += c;
        c   = (rt1 >> 44);
        rt1 = rt1 & 0xfffffffffff;
        rt2 += c; /* even if rt2 overflows, it still fits safely in the r^4
                     computation, and is safe to multiply with */

        R[0] = (uint32_t)(rt0) & 0x3ffffff;
        R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
        R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
        R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
        R[4] = (uint32_t)((rt2 >> 16));
    }
    st->flags    = 0;
    st->leftover = 0U;
}

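/*
 * Vectorized block function. Message data is consumed 32 bytes at a time as
 * two 16-byte Poly1305 blocks, one per 64-bit lane, with the accumulator
 * split into five 26-bit limbs (H0..H4). The main loop absorbs 64 bytes per
 * iteration by computing H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']);
 * the tail multiplies by [r^2,r^2], [r^2,r] or [r,1] depending on the
 * finalization flags so that both lanes end up weighted by the correct power
 * of r. When m is NULL (final call), the two lanes are summed and reduced
 * into the scalar representation st->H.h.
 */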
static POLY1305_NOINLINE void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
                unsigned long long bytes)
{
    CRYPTO_ALIGN(64)
    xmmi HIBIT =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
    const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
                                         _MM_SHUFFLE(1, 0, 1, 0));
    const xmmi FIVE =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
    xmmi H0, H1, H2, H3, H4;
    xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
    xmmi M0, M1, M2, M3, M4;
    xmmi M5, M6, M7, M8;
    xmmi C1, C2;
    xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
    xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;

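    /*
     * HIBIT is the implicit 2^128 padding bit for each lane (2^24 in the top
     * limb, which has weight 2^104). For the final blocks it is shifted or
     * cleared: poly1305_final_shift8 keeps it in one lane only and
     * poly1305_final_shift16 drops it entirely, since short final blocks get
     * an explicit padding byte in poly1305_finish_ext() instead.
     */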
    if (st->flags & poly1305_final_shift8) {
        HIBIT = _mm_srli_si128(HIBIT, 8);
    }
    if (st->flags & poly1305_final_shift16) {
        HIBIT = _mm_setzero_si128();
    }
    if (!(st->flags & poly1305_started)) {
        /* H = [Mx,My] */
        T5 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
        T6 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
        H0 = _mm_and_si128(MMASK, T5);
        H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
        H2 = _mm_and_si128(MMASK, T5);
        H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        H4 = _mm_srli_epi64(T6, 40);
        H4 = _mm_or_si128(H4, HIBIT);
        m += 32;
        bytes -= 32;
        st->flags |= poly1305_started;
    } else {
        T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]);
        T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]);
        T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]);
        H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
        H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
        H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
        H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
        H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
    }
    if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
        if (st->flags & poly1305_final_r2_r) {
            /* use [r^2, r] */
            T2  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T3  = _mm_cvtsi32_si128(st->R[4]);
            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
            T1  = _mm_cvtsi32_si128(st->R2[4]);
            T4  = _mm_unpacklo_epi32(T0, T2);
            T5  = _mm_unpackhi_epi32(T0, T2);
            R24 = _mm_unpacklo_epi64(T1, T3);
        } else {
            /* use [r^1, 1] */
            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T1  = _mm_cvtsi32_si128(st->R[4]);
            T2  = _mm_cvtsi32_si128(1);
            T4  = _mm_unpacklo_epi32(T0, T2);
            T5  = _mm_unpackhi_epi32(T0, T2);
            R24 = T1;
        }
        R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
        R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
        R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
        R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
    } else {
        /* use [r^2, r^2] */
        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
        T1  = _mm_cvtsi32_si128(st->R2[4]);
        R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
    }
    S21 = _mm_mul_epu32(R21, FIVE);
    S22 = _mm_mul_epu32(R22, FIVE);
    S23 = _mm_mul_epu32(R23, FIVE);
    S24 = _mm_mul_epu32(R24, FIVE);

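    /*
     * With at least 64 bytes remaining, load r^4 and run the unrolled main
     * loop: multiplying H by [r^4,r^4] lets each lane skip over the blocks
     * absorbed by the other lane, so four blocks are folded in per iteration.
     * S2x/S4x are the upper limbs premultiplied by 5, which folds products
     * reaching 2^130 back into the low limbs.
     */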
    if (bytes >= 64) {
        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
        T1  = _mm_cvtsi32_si128(st->R4[4]);
        R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
        S41 = _mm_mul_epu32(R41, FIVE);
        S42 = _mm_mul_epu32(R42, FIVE);
        S43 = _mm_mul_epu32(R43, FIVE);
        S44 = _mm_mul_epu32(R44, FIVE);

        while (bytes >= 64) {
            xmmi v00, v01, v02, v03, v04;
            xmmi v10, v11, v12, v13, v14;
            xmmi v20, v21, v22, v23, v24;
            xmmi v30, v31, v32, v33, v34;
            xmmi v40, v41, v42, v43, v44;
            xmmi T14, T15;

            /* H *= [r^4,r^4], preload [Mx,My] */
            T15 = S42;
            T0  = H4;
            T0  = _mm_mul_epu32(T0, S41);
            v01 = H3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S43;
            T1  = H4;
            T1  = _mm_mul_epu32(T1, T15);
            v11 = H3;
            v11 = _mm_mul_epu32(v11, T14);
            T2  = H4;
            T2  = _mm_mul_epu32(T2, T14);
            T0  = _mm_add_epi64(T0, v01);
            T15 = S44;
            v02 = H2;
            v02 = _mm_mul_epu32(v02, T14);
            T3  = H4;
            T3  = _mm_mul_epu32(T3, T15);
            T1  = _mm_add_epi64(T1, v11);
            v03 = H1;
            v03 = _mm_mul_epu32(v03, T15);
            v12 = H2;
            v12 = _mm_mul_epu32(v12, T15);
            T0  = _mm_add_epi64(T0, v02);
            T14 = R40;
            v21 = H3;
            v21 = _mm_mul_epu32(v21, T15);
            v31 = H3;
            v31 = _mm_mul_epu32(v31, T14);
            T0  = _mm_add_epi64(T0, v03);
            T4  = H4;
            T4  = _mm_mul_epu32(T4, T14);
            T1  = _mm_add_epi64(T1, v12);
            v04 = H0;
            v04 = _mm_mul_epu32(v04, T14);
            T2  = _mm_add_epi64(T2, v21);
            v13 = H1;
            v13 = _mm_mul_epu32(v13, T14);
            T3  = _mm_add_epi64(T3, v31);
            T15 = R41;
            v22 = H2;
            v22 = _mm_mul_epu32(v22, T14);
            v32 = H2;
            v32 = _mm_mul_epu32(v32, T15);
            T0  = _mm_add_epi64(T0, v04);
            v41 = H3;
            v41 = _mm_mul_epu32(v41, T15);
            T1  = _mm_add_epi64(T1, v13);
            v14 = H0;
            v14 = _mm_mul_epu32(v14, T15);
            T2  = _mm_add_epi64(T2, v22);
            T14 = R42;
            T5  = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
            v23 = H1;
            v23 = _mm_mul_epu32(v23, T15);
            T3  = _mm_add_epi64(T3, v32);
            v33 = H1;
            v33 = _mm_mul_epu32(v33, T14);
            T4  = _mm_add_epi64(T4, v41);
            v42 = H2;
            v42 = _mm_mul_epu32(v42, T14);
            T1  = _mm_add_epi64(T1, v14);
            T15 = R43;
            T6  = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
            v24 = H0;
            v24 = _mm_mul_epu32(v24, T14);
            T2  = _mm_add_epi64(T2, v23);
            v34 = H0;
            v34 = _mm_mul_epu32(v34, T15);
            T3  = _mm_add_epi64(T3, v33);
            M0  = _mm_and_si128(MMASK, T5);
            v43 = H1;
            v43 = _mm_mul_epu32(v43, T15);
            T4  = _mm_add_epi64(T4, v42);
            M1  = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
            v44 = H0;
            v44 = _mm_mul_epu32(v44, R44);
            T2  = _mm_add_epi64(T2, v24);
            T5  = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
            T3  = _mm_add_epi64(T3, v34);
            M3  = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
            T4  = _mm_add_epi64(T4, v43);
            M2  = _mm_and_si128(MMASK, T5);
            T4  = _mm_add_epi64(T4, v44);
            M4  = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

            /* H += [Mx',My'] */
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M6 = _mm_slli_epi64(M6, 6);
            M7 = _mm_slli_epi64(M7, 12);
            M8 = _mm_slli_epi64(M8, 18);
            T0 = _mm_add_epi64(T0, M5);
            T1 = _mm_add_epi64(T1, M6);
            T2 = _mm_add_epi64(T2, M7);
            T3 = _mm_add_epi64(T3, M8);
            T4 = _mm_add_epi64(T4, HIBIT);

            /* H += [Mx,My]*[r^2,r^2] */
            T15 = S22;
            v00 = M4;
            v00 = _mm_mul_epu32(v00, S21);
            v01 = M3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S23;
            v10 = M4;
            v10 = _mm_mul_epu32(v10, T15);
            v11 = M3;
            v11 = _mm_mul_epu32(v11, T14);
            T0  = _mm_add_epi64(T0, v00);
            v20 = M4;
            v20 = _mm_mul_epu32(v20, T14);
            T0  = _mm_add_epi64(T0, v01);
            T15 = S24;
            v02 = M2;
            v02 = _mm_mul_epu32(v02, T14);
            T1  = _mm_add_epi64(T1, v10);
            v30 = M4;
            v30 = _mm_mul_epu32(v30, T15);
            T1  = _mm_add_epi64(T1, v11);
            v03 = M1;
            v03 = _mm_mul_epu32(v03, T15);
            T2  = _mm_add_epi64(T2, v20);
            v12 = M2;
            v12 = _mm_mul_epu32(v12, T15);
            T0  = _mm_add_epi64(T0, v02);
            T14 = R20;
            v21 = M3;
            v21 = _mm_mul_epu32(v21, T15);
            T3  = _mm_add_epi64(T3, v30);
            v31 = M3;
            v31 = _mm_mul_epu32(v31, T14);
            T0  = _mm_add_epi64(T0, v03);
            v40 = M4;
            v40 = _mm_mul_epu32(v40, T14);
            T1  = _mm_add_epi64(T1, v12);
            v04 = M0;
            v04 = _mm_mul_epu32(v04, T14);
            T2  = _mm_add_epi64(T2, v21);
            v13 = M1;
            v13 = _mm_mul_epu32(v13, T14);
            T3  = _mm_add_epi64(T3, v31);
            T15 = R21;
            v22 = M2;
            v22 = _mm_mul_epu32(v22, T14);
            T4  = _mm_add_epi64(T4, v40);
            v32 = M2;
            v32 = _mm_mul_epu32(v32, T15);
            T0  = _mm_add_epi64(T0, v04);
            v41 = M3;
            v41 = _mm_mul_epu32(v41, T15);
            T1  = _mm_add_epi64(T1, v13);
            v14 = M0;
            v14 = _mm_mul_epu32(v14, T15);
            T2  = _mm_add_epi64(T2, v22);
            T14 = R22;
            v23 = M1;
            v23 = _mm_mul_epu32(v23, T15);
            T3  = _mm_add_epi64(T3, v32);
            v33 = M1;
            v33 = _mm_mul_epu32(v33, T14);
            T4  = _mm_add_epi64(T4, v41);
            v42 = M2;
            v42 = _mm_mul_epu32(v42, T14);
            T1  = _mm_add_epi64(T1, v14);
            T15 = R23;
            v24 = M0;
            v24 = _mm_mul_epu32(v24, T14);
            T2  = _mm_add_epi64(T2, v23);
            v34 = M0;
            v34 = _mm_mul_epu32(v34, T15);
            T3  = _mm_add_epi64(T3, v33);
            v43 = M1;
            v43 = _mm_mul_epu32(v43, T15);
            T4  = _mm_add_epi64(T4, v42);
            v44 = M0;
            v44 = _mm_mul_epu32(v44, R24);
            T2  = _mm_add_epi64(T2, v24);
            T3  = _mm_add_epi64(T3, v34);
            T4  = _mm_add_epi64(T4, v43);
            T4  = _mm_add_epi64(T4, v44);

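            /*
             * Partial (lazy) carry propagation: two interleaved carry chains
             * start at limbs 0 and 3, and the carry out of limb 4 wraps back
             * into limb 0 multiplied by 5. The limbs are not fully reduced,
             * but stay small enough for the next multiplication.
             */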
            /* reduce */
            C1 = _mm_srli_epi64(T0, 26);
            C2 = _mm_srli_epi64(T3, 26);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_and_si128(T3, MMASK);
            T1 = _mm_add_epi64(T1, C1);
            T4 = _mm_add_epi64(T4, C2);
            C1 = _mm_srli_epi64(T1, 26);
            C2 = _mm_srli_epi64(T4, 26);
            T1 = _mm_and_si128(T1, MMASK);
            T4 = _mm_and_si128(T4, MMASK);
            T2 = _mm_add_epi64(T2, C1);
            T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
            C1 = _mm_srli_epi64(T2, 26);
            C2 = _mm_srli_epi64(T0, 26);
            T2 = _mm_and_si128(T2, MMASK);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_add_epi64(T3, C1);
            T1 = _mm_add_epi64(T1, C2);
            C1 = _mm_srli_epi64(T3, 26);
            T3 = _mm_and_si128(T3, MMASK);
            T4 = _mm_add_epi64(T4, C1);

            /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
            H0 = T0;
            H1 = T1;
            H2 = T2;
            H3 = T3;
            H4 = T4;

            m += 64;
            bytes -= 64;
        }
    }

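    /*
     * Tail: at most one more 32-byte chunk. H is multiplied by the power
     * pair selected above ([r^2,r^2] normally, [r^2,r] or [r,1] when
     * finalizing), and two more blocks are added unless this is the
     * lane-combining call (m == NULL) issued by poly1305_finish_ext().
     */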
    if (bytes >= 32) {
        xmmi v01, v02, v03, v04;
        xmmi v11, v12, v13, v14;
        xmmi v21, v22, v23, v24;
        xmmi v31, v32, v33, v34;
        xmmi v41, v42, v43, v44;
        xmmi T14, T15;

        /* H *= [r^2,r^2] */
        T15 = S22;
        T0  = H4;
        T0  = _mm_mul_epu32(T0, S21);
        v01 = H3;
        v01 = _mm_mul_epu32(v01, T15);
        T14 = S23;
        T1  = H4;
        T1  = _mm_mul_epu32(T1, T15);
        v11 = H3;
        v11 = _mm_mul_epu32(v11, T14);
        T2  = H4;
        T2  = _mm_mul_epu32(T2, T14);
        T0  = _mm_add_epi64(T0, v01);
        T15 = S24;
        v02 = H2;
        v02 = _mm_mul_epu32(v02, T14);
        T3  = H4;
        T3  = _mm_mul_epu32(T3, T15);
        T1  = _mm_add_epi64(T1, v11);
        v03 = H1;
        v03 = _mm_mul_epu32(v03, T15);
        v12 = H2;
        v12 = _mm_mul_epu32(v12, T15);
        T0  = _mm_add_epi64(T0, v02);
        T14 = R20;
        v21 = H3;
        v21 = _mm_mul_epu32(v21, T15);
        v31 = H3;
        v31 = _mm_mul_epu32(v31, T14);
        T0  = _mm_add_epi64(T0, v03);
        T4  = H4;
        T4  = _mm_mul_epu32(T4, T14);
        T1  = _mm_add_epi64(T1, v12);
        v04 = H0;
        v04 = _mm_mul_epu32(v04, T14);
        T2  = _mm_add_epi64(T2, v21);
        v13 = H1;
        v13 = _mm_mul_epu32(v13, T14);
        T3  = _mm_add_epi64(T3, v31);
        T15 = R21;
        v22 = H2;
        v22 = _mm_mul_epu32(v22, T14);
        v32 = H2;
        v32 = _mm_mul_epu32(v32, T15);
        T0  = _mm_add_epi64(T0, v04);
        v41 = H3;
        v41 = _mm_mul_epu32(v41, T15);
        T1  = _mm_add_epi64(T1, v13);
        v14 = H0;
        v14 = _mm_mul_epu32(v14, T15);
        T2  = _mm_add_epi64(T2, v22);
        T14 = R22;
        v23 = H1;
        v23 = _mm_mul_epu32(v23, T15);
        T3  = _mm_add_epi64(T3, v32);
        v33 = H1;
        v33 = _mm_mul_epu32(v33, T14);
        T4  = _mm_add_epi64(T4, v41);
        v42 = H2;
        v42 = _mm_mul_epu32(v42, T14);
        T1  = _mm_add_epi64(T1, v14);
        T15 = R23;
        v24 = H0;
        v24 = _mm_mul_epu32(v24, T14);
        T2  = _mm_add_epi64(T2, v23);
        v34 = H0;
        v34 = _mm_mul_epu32(v34, T15);
        T3  = _mm_add_epi64(T3, v33);
        v43 = H1;
        v43 = _mm_mul_epu32(v43, T15);
        T4  = _mm_add_epi64(T4, v42);
        v44 = H0;
        v44 = _mm_mul_epu32(v44, R24);
        T2  = _mm_add_epi64(T2, v24);
        T3  = _mm_add_epi64(T3, v34);
        T4  = _mm_add_epi64(T4, v43);
        T4  = _mm_add_epi64(T4, v44);

        /* H += [Mx,My] */
        if (m) {
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M1 = _mm_slli_epi64(M1, 6);
            M2 = _mm_slli_epi64(M2, 12);
            M3 = _mm_slli_epi64(M3, 18);
            T0 = _mm_add_epi64(T0, M0);
            T1 = _mm_add_epi64(T1, M1);
            T2 = _mm_add_epi64(T2, M2);
            T3 = _mm_add_epi64(T3, M3);
            T4 = _mm_add_epi64(T4, HIBIT);
        }

        /* reduce */
        C1 = _mm_srli_epi64(T0, 26);
        C2 = _mm_srli_epi64(T3, 26);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_and_si128(T3, MMASK);
        T1 = _mm_add_epi64(T1, C1);
        T4 = _mm_add_epi64(T4, C2);
        C1 = _mm_srli_epi64(T1, 26);
        C2 = _mm_srli_epi64(T4, 26);
        T1 = _mm_and_si128(T1, MMASK);
        T4 = _mm_and_si128(T4, MMASK);
        T2 = _mm_add_epi64(T2, C1);
        T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
        C1 = _mm_srli_epi64(T2, 26);
        C2 = _mm_srli_epi64(T0, 26);
        T2 = _mm_and_si128(T2, MMASK);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_add_epi64(T3, C1);
        T1 = _mm_add_epi64(T1, C2);
        C1 = _mm_srli_epi64(T3, 26);
        T3 = _mm_and_si128(T3, MMASK);
        T4 = _mm_add_epi64(T4, C1);

        /* H = (H*[r^2,r^2] + [Mx,My]) */
        H0 = T0;
        H1 = T1;
        H2 = T2;
        H3 = T3;
        H4 = T4;
    }

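    /*
     * With m != NULL, repack the 26-bit limbs into st->H.hh for the next
     * call. With m == NULL (finalization), sum the two lanes, propagate
     * carries, and store the result as three 44/44/42-bit limbs in st->H.h.
     */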
    if (m) {
        T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
        T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
        T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
        T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
        T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
        T0 = _mm_unpacklo_epi64(T0, T1);
        T1 = _mm_unpacklo_epi64(T2, T3);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1);
        _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4);
    } else {
        uint32_t t0, t1, t2, t3, t4, b;
        uint64_t h0, h1, h2, g0, g1, g2, c, nc;

        /* H = H[0]+H[1] */
        T0 = H0;
        T1 = H1;
        T2 = H2;
        T3 = H3;
        T4 = H4;

        T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
        T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
        T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
        T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
        T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

        t0 = _mm_cvtsi128_si32(T0);
        b  = (t0 >> 26);
        t0 &= 0x3ffffff;
        t1 = _mm_cvtsi128_si32(T1) + b;
        b  = (t1 >> 26);
        t1 &= 0x3ffffff;
        t2 = _mm_cvtsi128_si32(T2) + b;
        b  = (t2 >> 26);
        t2 &= 0x3ffffff;
        t3 = _mm_cvtsi128_si32(T3) + b;
        b  = (t3 >> 26);
        t3 &= 0x3ffffff;
        t4 = _mm_cvtsi128_si32(T4) + b;

        /* everything except t4 is in range, so this is all safe */
        h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
        h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
              ((uint64_t) t3 << 34)) &
             0xfffffffffffull;
        h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));

        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;
        c = (h1 >> 44);
        h1 &= 0xfffffffffff;
        h2 += c;
        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;

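        /*
         * Compute g = h + 5 - 2^130. If h >= 2^130 - 5, g is the reduced
         * value and no borrow occurs; otherwise g borrows and h was already
         * reduced. Select between h and g in constant time using the mask
         * derived from the borrow.
         */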
        g0 = h0 + 5;
        c  = (g0 >> 44);
        g0 &= 0xfffffffffff;
        g1 = h1 + c;
        c  = (g1 >> 44);
        g1 &= 0xfffffffffff;
        g2 = h2 + c - ((uint64_t) 1 << 42);

        c  = (g2 >> 63) - 1;
        nc = ~c;
        h0 = (h0 & nc) | (g0 & c);
        h1 = (h1 & nc) | (g1 & c);
        h2 = (h2 & nc) | (g2 & c);

        st->H.h[0] = h0;
        st->H.h[1] = h1;
        st->H.h[2] = h2;
    }
}

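/*
 * Streaming update: top up and flush the 32-byte internal buffer, feed all
 * remaining complete 32-byte chunks to poly1305_blocks(), and stash whatever
 * is left over for the next call or for finalization.
 */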
static void
poly1305_update(poly1305_state_internal_t *st, const unsigned char *m,
                unsigned long long bytes)
{
    unsigned long long i;

    /* handle leftover */
    if (st->leftover) {
        unsigned long long want = (poly1305_block_size - st->leftover);

        if (want > bytes) {
            want = bytes;
        }
        for (i = 0; i < want; i++) {
            st->buffer[st->leftover + i] = m[i];
        }
        bytes -= want;
        m += want;
        st->leftover += want;
        if (st->leftover < poly1305_block_size) {
            return;
        }
        poly1305_blocks(st, st->buffer, poly1305_block_size);
        st->leftover = 0;
    }

    /* process full blocks */
    if (bytes >= poly1305_block_size) {
        unsigned long long want = (bytes & ~(poly1305_block_size - 1));

        poly1305_blocks(st, m, want);
        m += want;
        bytes -= want;
    }

    /* store leftover */
    if (bytes) {
        for (i = 0; i < bytes; i++) {
            st->buffer[st->leftover + i] = m[i];
        }
        st->leftover += bytes;
    }
}

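/*
 * Finalization. Any remaining 1-31 bytes are zero-padded into a 32-byte
 * buffer (with an explicit 0x01 byte appended to a partial block) and run
 * through poly1305_blocks() with the shift flags set, so the implicit 2^128
 * bit is only applied to full blocks. A last call with m == NULL then
 * multiplies the lanes by [r^2,r] or [r,1] and folds them together, after
 * which the key's "s" half is added and the 16-byte tag is written out.
 */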
static POLY1305_NOINLINE void
poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
                    unsigned long long leftover, unsigned char mac[16])
{
    uint64_t h0, h1, h2;

    if (leftover) {
        CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };

        poly1305_block_copy31(final, m, leftover);
        if (leftover != 16) {
            final[leftover] = 1;
        }
        st->flags |=
            (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
        poly1305_blocks(st, final, 32);
    }

    if (st->flags & poly1305_started) {
        /* finalize, H *= [r^2,r], or H *= [r,1] */
        if (!leftover || (leftover > 16)) {
            st->flags |= poly1305_final_r2_r;
        } else {
            st->flags |= poly1305_final_r_1;
        }
        poly1305_blocks(st, NULL, 32);
    }

    h0 = st->H.h[0];
    h1 = st->H.h[1];
    h2 = st->H.h[2];

    /* pack h into two 64-bit words and add the key's "s" half (pad) */
    h0 = ((h0) | (h1 << 44));
    h1 = ((h1 >> 20) | (h2 << 24));
# ifdef HAVE_AMD64_ASM
    __asm__ __volatile__(
        "addq %2, %0 ;\n"
        "adcq %3, %1 ;\n"
        : "+r"(h0), "+r"(h1)
        : "r"(st->pad[0]), "r"(st->pad[1])
        : "flags", "cc");
# else
    {
        uint128_t h;

        memcpy(&h, &st->pad[0], 16);
        h += ((uint128_t) h1 << 64) | h0;
        h0 = (uint64_t) h;
        h1 = (uint64_t)(h >> 64);
    }
# endif
    _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());

    memcpy(&mac[0], &h0, 8);
    memcpy(&mac[8], &h1, 8);

    sodium_memzero((void *) st, sizeof *st);
}

static void
poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
{
    poly1305_finish_ext(st, st->buffer, st->leftover, mac);
}

static int
crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
                                      const unsigned char *key)
{
    COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
                    sizeof(poly1305_state_internal_t));
    poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_update(
    crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
    unsigned long long inlen)
{
    poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
                                       unsigned char *out)
{
    poly1305_finish((poly1305_state_internal_t *) (void *) state, out);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
                                 unsigned long long   inlen,
                                 const unsigned char *key)
{
    CRYPTO_ALIGN(64) poly1305_state_internal_t st;
    unsigned long long                         blocks;

    poly1305_init_ext(&st, key, inlen);
    blocks = inlen & ~31;
    if (blocks > 0) {
        poly1305_blocks(&st, m, blocks);
        m += blocks;
        inlen -= blocks;
    }
    poly1305_finish_ext(&st, m, inlen, out);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
                                        const unsigned char *in,
                                        unsigned long long   inlen,
                                        const unsigned char *k)
{
    unsigned char correct[16];

    crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);

    return crypto_verify_16(h, correct);
}

struct crypto_onetimeauth_poly1305_implementation
    crypto_onetimeauth_poly1305_sse2_implementation = {
        SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
        SODIUM_C99(.onetimeauth_verify =)
            crypto_onetimeauth_poly1305_sse2_verify,
        SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
        SODIUM_C99(.onetimeauth_update =)
            crypto_onetimeauth_poly1305_sse2_update,
        SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
    };

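/*
 * Usage sketch (not part of this file): callers normally go through the
 * generic crypto_onetimeauth_poly1305 front end, which is expected to
 * dispatch to this implementation when the CPU supports SSE2. Assuming that
 * front end:
 *
 *     unsigned char key[crypto_onetimeauth_poly1305_KEYBYTES];
 *     unsigned char tag[crypto_onetimeauth_poly1305_BYTES];
 *
 *     crypto_onetimeauth_poly1305(tag, msg, msglen, key);
 *     if (crypto_onetimeauth_poly1305_verify(tag, msg, msglen, key) != 0) {
 *         // reject: forged or corrupted message
 *     }
 *
 * A Poly1305 key must only ever authenticate a single message.
 */
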
#endif