1 /* poly1305.c
2  *
3  * Copyright (C) 2006-2021 wolfSSL Inc.
4  *
5  * This file is part of wolfSSL.
6  *
7  * wolfSSL is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * wolfSSL is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20  */
21 /*
22 
23 DESCRIPTION
24 This library contains implementation for the Poly1305 authenticator.
25 
26 Based off the public domain implementations by Andrew Moon
27 and Daniel J. Bernstein
28 
29 */
30 
31 
32 #ifdef HAVE_CONFIG_H
33     #include <config.h>
34 #endif
35 
36 #include <wolfssl/wolfcrypt/settings.h>
37 
38 #ifdef HAVE_POLY1305
39 #include <wolfssl/wolfcrypt/poly1305.h>
40 #include <wolfssl/wolfcrypt/error-crypt.h>
41 #include <wolfssl/wolfcrypt/logging.h>
42 #include <wolfssl/wolfcrypt/cpuid.h>
43 #ifdef NO_INLINE
44     #include <wolfssl/wolfcrypt/misc.h>
45 #else
46     #define WOLFSSL_MISC_INCLUDED
47     #include <wolfcrypt/src/misc.c>
48 #endif
49 #ifdef CHACHA_AEAD_TEST
50     #include <stdio.h>
51 #endif
52 
53 #ifdef _MSC_VER
54     /* 4127 warning constant while(1)  */
55     #pragma warning(disable: 4127)
56 #endif
57 
/* Intel speedup configuration: pull in the SIMD intrinsic headers and decide
 * which assembly back ends (AVX1, AVX2) this build may use. */
#ifdef USE_INTEL_SPEEDUP
    #include <emmintrin.h>
    #include <immintrin.h>

    /* Compilers reporting a GNUC version of 4.8 or older: force AVX2 off. */
    #if defined(__GNUC__) && ((__GNUC__ < 4) || \
                              (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
        #undef  NO_AVX2_SUPPORT
        #define NO_AVX2_SUPPORT
    #endif
    /* clang 3.5 or older: AVX2 off. For any newer clang, NO_AVX2_SUPPORT is
     * removed again - this undoes the GNUC check above (presumably because
     * clang reports an old GNUC version - TODO confirm) and, as written, also
     * discards a NO_AVX2_SUPPORT supplied by the build configuration. */
    #if defined(__clang__) && ((__clang_major__ < 3) || \
                               (__clang_major__ == 3 && __clang_minor__ <= 5))
        #define NO_AVX2_SUPPORT
    #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
        #undef NO_AVX2_SUPPORT
    #endif

    /* AVX1 is always assumed with USE_INTEL_SPEEDUP; AVX2 is optional. */
    #define HAVE_INTEL_AVX1
    #ifndef NO_AVX2_SUPPORT
        #define HAVE_INTEL_AVX2
    #endif
#endif
79 
#ifdef USE_INTEL_SPEEDUP
/* CPU feature flags obtained from cpuid_get_flags(); filled in by
 * wc_Poly1305SetKey the first time it runs and tested with IS_INTEL_AVX2(). */
static word32 intel_flags = 0;
/* Non-zero once intel_flags has been populated. */
static word32 cpu_flags_set = 0;
#endif
84 
/* Helpers for the 64-bit implementation: a no-inline attribute and a small
 * 128-bit arithmetic layer (word128 plus MUL/ADD/ADDLO/SHR/LO) with one
 * definition for MSVC and one for GNU-compatible compilers.
 *
 * NOTE: the macro arguments are not all parenthesized and the MSVC ADD/ADDLO
 * expand to a brace block, so pass plain variable names only and use them as
 * complete statements.
 */
#if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
    #if defined(_MSC_VER)
        #define POLY1305_NOINLINE __declspec(noinline)
    #elif defined(__GNUC__)
        #define POLY1305_NOINLINE __attribute__((noinline))
    #else
        #define POLY1305_NOINLINE
    #endif

    #if defined(_MSC_VER)
        #include <intrin.h>

        /* 128-bit value held as two 64-bit halves. */
        typedef struct word128 {
            word64 lo;
            word64 hi;
        } word128;

        /* out = x * y (64x64 -> 128-bit product split into lo/hi). */
        #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi)
        /* out += in (128-bit); the carry out of the low half is detected by
         * the wrapped sum being smaller than the previous low half. */
        #define ADD(out, in) { word64 t = out.lo; out.lo += in.lo; \
                               out.hi += (out.lo < t) + in.hi; }
        /* out += in, where in is a 64-bit value. */
        #define ADDLO(out, in) { word64 t = out.lo; out.lo += in; \
                                 out.hi += (out.lo < t); }
        /* 64-bit result of shifting the 128-bit value right by shift. */
        #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift)))
        /* Low 64 bits of the 128-bit value. */
        #define LO(in) (in.lo)

    #elif defined(__GNUC__)
        /* Use the compiler's native 128-bit unsigned integer. */
        #if defined(__SIZEOF_INT128__)
            PEDANTIC_EXTENSION typedef unsigned __int128 word128;
        #else
            typedef unsigned word128 __attribute__((mode(TI)));
        #endif

        /* Same operations as the MSVC branch, expressed with native
         * 128-bit arithmetic. */
        #define MUL(out, x, y) out = ((word128)x * y)
        #define ADD(out, in) out += in
        #define ADDLO(out, in) out += in
        #define SHR(in, shift) (word64)(in >> (shift))
        #define LO(in) (word64)(in)
    #endif
#endif
124 
125 #ifdef USE_INTEL_SPEEDUP
126 #ifdef __cplusplus
127     extern "C" {
128 #endif
129 
130 #ifdef HAVE_INTEL_AVX1
131 /* Process one block (16 bytes) of data.
132  *
133  * ctx  Poly1305 context.
134  * m    One block of message data.
135  */
136 extern void poly1305_block_avx(Poly1305* ctx, const unsigned char *m);
137 /* Process multiple blocks (n * 16 bytes) of data.
138  *
139  * ctx    Poly1305 context.
140  * m      Blocks of message data.
141  * bytes  The number of bytes to process.
142  */
143 extern void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m,
144                                 size_t bytes);
/* Set the key to use when processing data.
 * Initialize the context.
 *
 * ctx  Poly1305 context.
 * key  The key data. wc_Poly1305SetKey passes its full 32-byte key here
 *      after checking keySz == 32.
 */
151 extern void poly1305_setkey_avx(Poly1305* ctx, const byte* key);
152 /* Calculate the final result - authentication data.
153  * Zeros out the private data in the context.
154  *
155  * ctx  Poly1305 context.
156  * mac  Buffer to hold 16 bytes.
157  */
158 extern void poly1305_final_avx(Poly1305* ctx, byte* mac);
159 #endif
160 
161 #ifdef HAVE_INTEL_AVX2
162 /* Process multiple blocks (n * 16 bytes) of data.
163  *
164  * ctx    Poly1305 context.
165  * m      Blocks of message data.
166  * bytes  The number of bytes to process.
167  */
168 extern void poly1305_blocks_avx2(Poly1305* ctx, const unsigned char* m,
169                                  size_t bytes);
170 /* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
171  *
172  * ctx    Poly1305 context.
173  */
174 extern void poly1305_calc_powers_avx2(Poly1305* ctx);
/* Set the key to use when processing data.
 * Initialize the context.
 * Calls AVX set key function as final function calls AVX code.
 *
 * ctx  Poly1305 context.
 * key  The key data. wc_Poly1305SetKey passes its full 32-byte key here
 *      after checking keySz == 32.
 */
182 extern void poly1305_setkey_avx2(Poly1305* ctx, const byte* key);
183 /* Calculate the final result - authentication data.
184  * Zeros out the private data in the context.
185  * Calls AVX final function to quickly process last blocks.
186  *
187  * ctx  Poly1305 context.
188  * mac  Buffer to hold 16 bytes - authentication data.
189  */
190 extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
191 #endif
192 
193 #ifdef __cplusplus
194     }  /* extern "C" */
195 #endif
196 
197 #elif defined(POLY130564)
198 #ifndef WOLFSSL_ARMASM
U8TO64(const byte * p)199     static word64 U8TO64(const byte* p)
200     {
201         return
202             (((word64)(p[0] & 0xff)      ) |
203              ((word64)(p[1] & 0xff) <<  8) |
204              ((word64)(p[2] & 0xff) << 16) |
205              ((word64)(p[3] & 0xff) << 24) |
206              ((word64)(p[4] & 0xff) << 32) |
207              ((word64)(p[5] & 0xff) << 40) |
208              ((word64)(p[6] & 0xff) << 48) |
209              ((word64)(p[7] & 0xff) << 56));
210     }
211 
U64TO8(byte * p,word64 v)212     static void U64TO8(byte* p, word64 v) {
213         p[0] = (v      ) & 0xff;
214         p[1] = (v >>  8) & 0xff;
215         p[2] = (v >> 16) & 0xff;
216         p[3] = (v >> 24) & 0xff;
217         p[4] = (v >> 32) & 0xff;
218         p[5] = (v >> 40) & 0xff;
219         p[6] = (v >> 48) & 0xff;
220         p[7] = (v >> 56) & 0xff;
221     }
222 #endif/* WOLFSSL_ARMASM */
223 #else /* if not 64 bit then use 32 bit */
224 
U8TO32(const byte * p)225     static word32 U8TO32(const byte *p)
226     {
227         return
228             (((word32)(p[0] & 0xff)      ) |
229              ((word32)(p[1] & 0xff) <<  8) |
230              ((word32)(p[2] & 0xff) << 16) |
231              ((word32)(p[3] & 0xff) << 24));
232     }
233 
U32TO8(byte * p,word32 v)234     static void U32TO8(byte *p, word32 v) {
235         p[0] = (byte)((v      ) & 0xff);
236         p[1] = (byte)((v >>  8) & 0xff);
237         p[2] = (byte)((v >> 16) & 0xff);
238         p[3] = (byte)((v >> 24) & 0xff);
239     }
240 #endif
241 
242 /* convert 32-bit unsigned to little endian 64 bit type as byte array */
u32tole64(const word32 inLe32,byte outLe64[8])243 static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8])
244 {
245 #ifndef WOLFSSL_X86_64_BUILD
246     outLe64[0] = (byte)(inLe32  & 0x000000FF);
247     outLe64[1] = (byte)((inLe32 & 0x0000FF00) >> 8);
248     outLe64[2] = (byte)((inLe32 & 0x00FF0000) >> 16);
249     outLe64[3] = (byte)((inLe32 & 0xFF000000) >> 24);
250     outLe64[4] = 0;
251     outLe64[5] = 0;
252     outLe64[6] = 0;
253     outLe64[7] = 0;
254 #else
255     *(word64*)outLe64 = inLe32;
256 #endif
257 }
258 
259 
260 #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
/*
Absorb whole 16-byte blocks of message data into the accumulator ctx->h.

ctx    Poly1305 context; r is read, h is read and updated. When
       ctx->finished is set the implicit 2^128 bit is not added to a block.
m      Message data.
bytes  Number of bytes available at m. Only complete POLY1305_BLOCK_SIZE
       blocks are consumed by the C implementations; a trailing partial
       block is ignored.

Returns 0 on the C paths. On the Intel path the SAVE_VECTOR_REGISTERS macro
is handed "return _svr_ret;" as its failure action, so presumably an error
code can be returned from there (confirm against the macro definition).
*/
static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
                     size_t bytes)
{
#ifdef USE_INTEL_SPEEDUP
    /* AVX2 is handled in wc_Poly1305Update. */
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    poly1305_blocks_avx(ctx, m, bytes);
    RESTORE_VECTOR_REGISTERS();
    return 0;
#elif defined(POLY130564)
    /* 64-bit path: h and r are kept as three limbs of 44, 44 and 42 bits. */
    const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
    word64 r0,r1,r2;
    word64 s1,s2;
    word64 h0,h1,h2;
    word64 c;
    word128 d0,d1,d2,d;

    r0 = ctx->r[0];
    r1 = ctx->r[1];
    r2 = ctx->r[2];

    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];

    /* 20 * r1 and 20 * r2, used below for the product terms that are folded
     * back into the lower limbs */
    s1 = r1 * (5 << 2);
    s2 = r2 * (5 << 2);

    while (bytes >= POLY1305_BLOCK_SIZE) {
        word64 t0,t1;

        /* h += m[i] */
        t0 = U8TO64(&m[0]);
        t1 = U8TO64(&m[8]);

        /* split the 128-bit block into 44/44/40-bit pieces; hibit supplies
         * bit 128 unless this is the padded final block */
        h0 += (( t0                    ) & 0xfffffffffff);
        h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
        h2 += (((t1 >> 24)             ) & 0x3ffffffffff) | hibit;

        /* h *= r */
        MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
        MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
        MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);

        /* (partial) h %= p */
        /* carry d0 -> d1 -> d2, then fold the overflow of the top limb back
         * into h0 multiplied by 5; h1 may keep a small excess */
                      c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
        ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
        ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
        h0  += c * 5; c = (h0 >> 44);  h0 =    h0  & 0xfffffffffff;
        h1  += c;

        m += POLY1305_BLOCK_SIZE;
        bytes -= POLY1305_BLOCK_SIZE;
    }

    /* store the updated accumulator */
    ctx->h[0] = h0;
    ctx->h[1] = h1;
    ctx->h[2] = h2;

    return 0;

#else /* if not 64 bit then use 32 bit */
    /* 32-bit path: h and r are kept as five 26-bit limbs. */
    const word32 hibit = (ctx->finished) ? 0 : ((word32)1 << 24); /* 1 << 128 */
    word32 r0,r1,r2,r3,r4;
    word32 s1,s2,s3,s4;
    word32 h0,h1,h2,h3,h4;
    word64 d0,d1,d2,d3,d4;
    word32 c;


    r0 = ctx->r[0];
    r1 = ctx->r[1];
    r2 = ctx->r[2];
    r3 = ctx->r[3];
    r4 = ctx->r[4];

    /* 5 * r[1..4], used below for the product terms that are folded back
     * into the lower limbs */
    s1 = r1 * 5;
    s2 = r2 * 5;
    s3 = r3 * 5;
    s4 = r4 * 5;

    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];
    h3 = ctx->h[3];
    h4 = ctx->h[4];

    while (bytes >= POLY1305_BLOCK_SIZE) {
        /* h += m[i] */
        /* overlapping 32-bit reads at byte offsets 0/3/6/9/12, shifted so
         * each yields the next 26 bits of the block; hibit supplies bit 128
         * unless this is the padded final block */
        h0 += (U8TO32(m+ 0)     ) & 0x3ffffff;
        h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
        h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
        h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
        h4 += (U8TO32(m+12) >> 8) | hibit;

        /* h *= r */
        d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) +
             ((word64)h3 * s2) + ((word64)h4 * s1);
        d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) +
             ((word64)h3 * s3) + ((word64)h4 * s2);
        d2 = ((word64)h0 * r2) + ((word64)h1 * r1) + ((word64)h2 * r0) +
             ((word64)h3 * s4) + ((word64)h4 * s3);
        d3 = ((word64)h0 * r3) + ((word64)h1 * r2) + ((word64)h2 * r1) +
             ((word64)h3 * r0) + ((word64)h4 * s4);
        d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) +
             ((word64)h3 * r1) + ((word64)h4 * r0);

        /* (partial) h %= p */
        /* carry d0 -> d4, then fold the overflow of the top limb back into
         * h0 multiplied by 5; h1 may keep a small excess */
                      c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff;
        d1 += c;      c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff;
        d2 += c;      c = (word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff;
        d3 += c;      c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff;
        d4 += c;      c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff;
        h0 += c * 5;  c =  (h0 >> 26); h0 =                h0 & 0x3ffffff;
        h1 += c;

        m += POLY1305_BLOCK_SIZE;
        bytes -= POLY1305_BLOCK_SIZE;
    }

    /* store the updated accumulator */
    ctx->h[0] = h0;
    ctx->h[1] = h1;
    ctx->h[2] = h2;
    ctx->h[3] = h3;
    ctx->h[4] = h4;

    return 0;

#endif /* end of 64 bit cpu blocks or 32 bit cpu */
}
395 
396 /*
397 This local function is used for the last call when a message with a given
398 number of bytes is less than the block size.
399 */
poly1305_block(Poly1305 * ctx,const unsigned char * m)400 static int poly1305_block(Poly1305* ctx, const unsigned char *m)
401 {
402 #ifdef USE_INTEL_SPEEDUP
403     /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
404     SAVE_VECTOR_REGISTERS(return _svr_ret;);
405     poly1305_block_avx(ctx, m);
406     RESTORE_VECTOR_REGISTERS();
407     return 0;
408 #else
409     return poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
410 #endif
411 }
412 #endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
413 
414 #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
/* Load a key into the context and reset the MAC state.
 *
 * ctx    Poly1305 context to initialize.
 * key    Key data; bytes 0..15 are clamped into r, bytes 16..31 are saved
 *        as the pad that is added in wc_Poly1305Final.
 * keySz  Length of key in bytes; must be 32.
 *
 * Returns 0 on success, BAD_FUNC_ARG when ctx or key is NULL or keySz is not
 * 32. On the Intel path SAVE_VECTOR_REGISTERS is given "return _svr_ret;" as
 * its failure action, so presumably an error code can also come from there.
 */
int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
{
#if defined(POLY130564) && !defined(USE_INTEL_SPEEDUP)
    word64 t0,t1;
#endif

    /* Validate every argument up front so that the optional debug dump below
     * only ever walks a key of the expected length. */
    if (ctx == NULL || key == NULL || keySz != 32)
        return BAD_FUNC_ARG;

#ifdef CHACHA_AEAD_TEST
    {
        /* Own scope: keeps the declaration ahead of statements for compilers
         * that do not accept mixed declarations and code. */
        word32 k;
        printf("Poly key used:\n");
        for (k = 0; k < keySz; k++) {
            printf("%02x", key[k]);
            if ((k+1) % 8 == 0)
                printf("\n");
        }
        printf("\n");
    }
#endif

#ifdef USE_INTEL_SPEEDUP
    /* Detect CPU features once; later calls reuse the cached flags. */
    if (!cpu_flags_set) {
        intel_flags = cpuid_get_flags();
        cpu_flags_set = 1;
    }
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags))
        poly1305_setkey_avx2(ctx, key);
    else
    #endif
        poly1305_setkey_avx(ctx, key);
    RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
    t0 = U8TO64(key + 0);
    t1 = U8TO64(key + 8);

    /* clamp and split r into 44/44/42-bit limbs */
    ctx->r[0] = ( t0                    ) & 0xffc0fffffff;
    ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
    ctx->r[2] = ((t1 >> 24)             ) & 0x00ffffffc0f;

    /* h (accumulator) = 0 */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;

    /* save pad for later */
    ctx->pad[0] = U8TO64(key + 16);
    ctx->pad[1] = U8TO64(key + 24);

    ctx->leftover = 0;
    ctx->finished = 0;

#else /* if not 64 bit then use 32 bit */

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
    /* clamp and split r into five 26-bit limbs */
    ctx->r[0] = (U8TO32(key +  0)     ) & 0x3ffffff;
    ctx->r[1] = (U8TO32(key +  3) >> 2) & 0x3ffff03;
    ctx->r[2] = (U8TO32(key +  6) >> 4) & 0x3ffc0ff;
    ctx->r[3] = (U8TO32(key +  9) >> 6) & 0x3f03fff;
    ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff;

    /* h = 0 */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->h[3] = 0;
    ctx->h[4] = 0;

    /* save pad for later */
    ctx->pad[0] = U8TO32(key + 16);
    ctx->pad[1] = U8TO32(key + 20);
    ctx->pad[2] = U8TO32(key + 24);
    ctx->pad[3] = U8TO32(key + 28);

    ctx->leftover = 0;
    ctx->finished = 0;

#endif

    return 0;
}
502 
/* Finish the MAC computation and write the 16-byte tag.
 *
 * ctx  Poly1305 context that has been keyed and updated.
 * mac  Output buffer; 16 bytes are written.
 *
 * Returns 0 on success, BAD_FUNC_ARG when ctx or mac is NULL. On the Intel
 * path SAVE_VECTOR_REGISTERS is given "return _svr_ret;" as its failure
 * action, so presumably an error code can also come from there.
 *
 * The C paths clear h, r and pad in the context before returning; they do
 * not clear buffer, leftover or finished, so the context must be keyed again
 * with wc_Poly1305SetKey before it is reused.
 */
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
{
#ifdef USE_INTEL_SPEEDUP
    /* no locals needed: the assembly routines do all the work */
#elif defined(POLY130564)

    word64 h0,h1,h2,c;
    word64 g0,g1,g2;
    word64 t0,t1;

#else

    word32 h0,h1,h2,h3,h4,c;
    word32 g0,g1,g2,g3,g4;
    word64 f;
    word32 mask;

#endif

    if (ctx == NULL || mac == NULL)
        return BAD_FUNC_ARG;

#ifdef USE_INTEL_SPEEDUP
    /* pick the AVX2 or AVX finalizer according to the cached CPU flags */
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags))
        poly1305_final_avx2(ctx, mac);
    else
    #endif
        poly1305_final_avx(ctx, mac);
    RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)

    /* process the remaining block */
    if (ctx->leftover) {
        /* append a single 1 byte, zero-fill to a full block, and set
         * finished so that poly1305_blocks does not add the 2^128 bit */
        size_t i = ctx->leftover;
        ctx->buffer[i] = 1;
        for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++)
            ctx->buffer[i] = 0;
        ctx->finished = 1;
        poly1305_block(ctx, ctx->buffer);
    }

    /* fully carry h */
    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];

                 c = (h1 >> 44); h1 &= 0xfffffffffff;
    h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
    h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
    h1 += c;     c = (h1 >> 44); h1 &= 0xfffffffffff;
    h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
    h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
    h1 += c;

    /* compute h + -p */
    /* g = h + 5 - 2^130; the subtraction wraps g2 (top bit set) when h < p */
    g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
    g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
    g2 = h2 + c - ((word64)1 << 42);

    /* select h if h < p, or h + -p if h >= p */
    /* branch-free: c is all ones when the top bit of g2 is clear (keep g),
     * all zeros otherwise (keep h) */
    c = (g2 >> ((sizeof(word64) * 8) - 1)) - 1;
    g0 &= c;
    g1 &= c;
    g2 &= c;
    c = ~c;
    h0 = (h0 & c) | g0;
    h1 = (h1 & c) | g1;
    h2 = (h2 & c) | g2;

    /* h = (h + pad) */
    t0 = ctx->pad[0];
    t1 = ctx->pad[1];

    /* the 128-bit pad is split into the same 44/44/40-bit pieces as h */
    h0 += (( t0                    ) & 0xfffffffffff)    ;
    c = (h0 >> 44); h0 &= 0xfffffffffff;
    h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
    c = (h1 >> 44); h1 &= 0xfffffffffff;
    h2 += (((t1 >> 24)             ) & 0x3ffffffffff) + c;
    h2 &= 0x3ffffffffff;

    /* mac = h % (2^128) */
    /* repack the three limbs into two 64-bit words */
    h0 = ((h0      ) | (h1 << 44));
    h1 = ((h1 >> 20) | (h2 << 24));

    U64TO8(mac + 0, h0);
    U64TO8(mac + 8, h1);

    /* zero out the state */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->r[0] = 0;
    ctx->r[1] = 0;
    ctx->r[2] = 0;
    ctx->pad[0] = 0;
    ctx->pad[1] = 0;

#else /* if not 64 bit then use 32 bit */

    /* process the remaining block */
    if (ctx->leftover) {
        /* append a single 1 byte, zero-fill to a full block, and set
         * finished so that poly1305_blocks does not add the 2^128 bit */
        size_t i = ctx->leftover;
        ctx->buffer[i++] = 1;
        for (; i < POLY1305_BLOCK_SIZE; i++)
            ctx->buffer[i] = 0;
        ctx->finished = 1;
        poly1305_block(ctx, ctx->buffer);
    }

    /* fully carry h */
    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];
    h3 = ctx->h[3];
    h4 = ctx->h[4];

                 c = h1 >> 26; h1 = h1 & 0x3ffffff;
    h2 +=     c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
    h3 +=     c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
    h4 +=     c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
    h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
    h1 +=     c;

    /* compute h + -p */
    /* g = h + 5 - 2^130; the subtraction wraps g4 (top bit set) when h < p */
    g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
    g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
    g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
    g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
    g4 = h4 + c - ((word32)1 << 26);

    /* select h if h < p, or h + -p if h >= p */
    /* branch-free: mask is all ones when the top bit of g4 is clear
     * (keep g), all zeros otherwise (keep h) */
    mask = ((word32)g4 >> ((sizeof(word32) * 8) - 1)) - 1;
    g0 &= mask;
    g1 &= mask;
    g2 &= mask;
    g3 &= mask;
    g4 &= mask;
    mask = ~mask;
    h0 = (h0 & mask) | g0;
    h1 = (h1 & mask) | g1;
    h2 = (h2 & mask) | g2;
    h3 = (h3 & mask) | g3;
    h4 = (h4 & mask) | g4;

    /* h = h % (2^128) */
    /* repack the five 26-bit limbs into four 32-bit words */
    h0 = ((h0      ) | (h1 << 26)) & 0xffffffff;
    h1 = ((h1 >>  6) | (h2 << 20)) & 0xffffffff;
    h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
    h3 = ((h3 >> 18) | (h4 <<  8)) & 0xffffffff;

    /* mac = (h + pad) % (2^128) */
    /* f carries the overflow of each 32-bit addition into the next word */
    f = (word64)h0 + ctx->pad[0]            ; h0 = (word32)f;
    f = (word64)h1 + ctx->pad[1] + (f >> 32); h1 = (word32)f;
    f = (word64)h2 + ctx->pad[2] + (f >> 32); h2 = (word32)f;
    f = (word64)h3 + ctx->pad[3] + (f >> 32); h3 = (word32)f;

    U32TO8(mac + 0, h0);
    U32TO8(mac + 4, h1);
    U32TO8(mac + 8, h2);
    U32TO8(mac + 12, h3);

    /* zero out the state */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->h[3] = 0;
    ctx->h[4] = 0;
    ctx->r[0] = 0;
    ctx->r[1] = 0;
    ctx->r[2] = 0;
    ctx->r[3] = 0;
    ctx->r[4] = 0;
    ctx->pad[0] = 0;
    ctx->pad[1] = 0;
    ctx->pad[2] = 0;
    ctx->pad[3] = 0;

#endif

    return 0;
}
685 #endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
686 
687 
/* Feed message bytes into the MAC computation.
 *
 * Input that does not fill a whole block is held in ctx->buffer
 * (ctx->leftover bytes) until later calls complete it or wc_Poly1305Final
 * pads it. On the AVX2 path the unit of work is sizeof(ctx->buffer); on all
 * other paths it is POLY1305_BLOCK_SIZE.
 *
 * ctx    Poly1305 context, already keyed.
 * m      Message data; may be NULL only when bytes is 0.
 * bytes  Number of bytes at m.
 *
 * Returns 0 on success (including bytes == 0), BAD_FUNC_ARG when ctx is NULL
 * or m is NULL with bytes > 0, or the non-zero result of
 * poly1305_block()/poly1305_blocks() if block processing fails.
 */
int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
{
    size_t i;

    if (ctx == NULL || (m == NULL && bytes > 0))
        return BAD_FUNC_ARG;

    if (bytes == 0) {
        /* valid, but do nothing */
        return 0;
    }
#ifdef CHACHA_AEAD_TEST
    {
        /* Own scope: keeps the declaration ahead of statements for compilers
         * that do not accept mixed declarations and code. */
        word32 k;
        printf("Raw input to poly:\n");
        for (k = 0; k < bytes; k++) {
            printf("%02x", m[k]);
            if ((k+1) % 16 == 0)
                printf("\n");
        }
        printf("\n");
    }
#endif

#ifdef USE_INTEL_SPEEDUP
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags)) {
        SAVE_VECTOR_REGISTERS(return _svr_ret;);

        /* handle leftover */

        if (ctx->leftover) {
            size_t want = sizeof(ctx->buffer) - ctx->leftover;
            if (want > bytes)
                want = bytes;

            for (i = 0; i < want; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            bytes -= (word32)want;
            m += want;
            ctx->leftover += want;
            if (ctx->leftover < sizeof(ctx->buffer)) {
                /* still not a full buffer: wait for more input */
                RESTORE_VECTOR_REGISTERS();
                return 0;
            }

            /* powers of r are computed lazily before the first AVX2 call */
            if (!ctx->started)
                poly1305_calc_powers_avx2(ctx);
            poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
            ctx->leftover = 0;
        }

        /* process full blocks */
        if (bytes >= sizeof(ctx->buffer)) {
            size_t want = bytes & ~(sizeof(ctx->buffer) - 1);

            if (!ctx->started)
                poly1305_calc_powers_avx2(ctx);
            poly1305_blocks_avx2(ctx, m, want);
            m += want;
            bytes -= (word32)want;
        }

        /* store leftover */
        if (bytes) {
            for (i = 0; i < bytes; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            ctx->leftover += bytes;
        }
        RESTORE_VECTOR_REGISTERS();
    }
    else
    #endif
#endif
    {
        /* handle leftover */
        if (ctx->leftover) {
            size_t want = (POLY1305_BLOCK_SIZE - ctx->leftover);
            if (want > bytes)
                want = bytes;
            for (i = 0; i < want; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            bytes -= (word32)want;
            m += want;
            ctx->leftover += want;
            if (ctx->leftover < POLY1305_BLOCK_SIZE)
                return 0;
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
            {
                /* A failure here means the buffered block was not absorbed;
                 * report it instead of continuing with a wrong MAC. */
                int ret = poly1305_block(ctx, ctx->buffer);
                if (ret != 0)
                    return ret;
            }
#else
            poly1305_block(ctx, ctx->buffer);
#endif
            ctx->leftover = 0;
        }

        /* process full blocks */
        if (bytes >= POLY1305_BLOCK_SIZE) {
            size_t want = (bytes & ~(POLY1305_BLOCK_SIZE - 1));
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
            int ret;
            ret = poly1305_blocks(ctx, m, want);
            if (ret != 0)
                return ret;
#else
            poly1305_blocks(ctx, m, want);
#endif
            m += want;
            bytes -= (word32)want;
        }

        /* store leftover */
        if (bytes) {
            for (i = 0; i < bytes; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            ctx->leftover += bytes;
        }
    }

    return 0;
}
802 
803 /*  Takes a Poly1305 struct that has a key loaded and pads the provided length
804     ctx        : Initialized Poly1305 struct to use
805     lenToPad   : Current number of bytes updated that needs padding to 16
806  */
wc_Poly1305_Pad(Poly1305 * ctx,word32 lenToPad)807 int wc_Poly1305_Pad(Poly1305* ctx, word32 lenToPad)
808 {
809     int ret = 0;
810     word32 paddingLen;
811     byte padding[WC_POLY1305_PAD_SZ - 1];
812 
813     if (ctx == NULL) {
814         return BAD_FUNC_ARG;
815     }
816     if (lenToPad == 0) {
817         return 0; /* nothing needs to be done */
818     }
819 
820     XMEMSET(padding, 0, sizeof(padding));
821 
822     /* Pad length to 16 bytes */
823     paddingLen = (-(int)lenToPad) & (WC_POLY1305_PAD_SZ - 1);
824     if ((paddingLen > 0) && (paddingLen < WC_POLY1305_PAD_SZ)) {
825         ret = wc_Poly1305Update(ctx, padding, paddingLen);
826     }
827     return ret;
828 }
829 
830 /*  Takes a Poly1305 struct that has a key loaded and adds the AEAD length
831     encoding in 64-bit little endian
832     aadSz      : Size of the additional authentication data
833     dataSz     : Size of the plaintext or ciphertext
834  */
wc_Poly1305_EncodeSizes(Poly1305 * ctx,word32 aadSz,word32 dataSz)835 int wc_Poly1305_EncodeSizes(Poly1305* ctx, word32 aadSz, word32 dataSz)
836 {
837     int ret;
838     byte little64[16]; /* sizeof(word64) * 2 */
839 
840     if (ctx == NULL) {
841         return BAD_FUNC_ARG;
842     }
843 
844     XMEMSET(little64, 0, sizeof(little64));
845 
846     /* size of additional data and input data as little endian 64 bit types */
847     u32tole64(aadSz,  little64);
848     u32tole64(dataSz, little64 + 8);
849     ret = wc_Poly1305Update(ctx, little64, sizeof(little64));
850 
851     return ret;
852 }
853 
854 #ifdef WORD64_AVAILABLE
/*  64-bit variant of wc_Poly1305_EncodeSizes: feeds aadSz followed by dataSz
    to the MAC, each as a 64-bit little endian value
    ctx        : Initialized Poly1305 struct to use
    aadSz      : Size of the additional authentication data
    dataSz     : Size of the plaintext or ciphertext
    returns    : BAD_FUNC_ARG when ctx is NULL, otherwise the result of
                 wc_Poly1305Update
 */
int wc_Poly1305_EncodeSizes64(Poly1305* ctx, word64 aadSz, word64 dataSz)
{
    word64 lengths[2];

    if (ctx == NULL) {
        return BAD_FUNC_ARG;
    }

    /* in-memory representation must be little endian before it is hashed */
#ifndef BIG_ENDIAN_ORDER
    lengths[0] = aadSz;
    lengths[1] = dataSz;
#else
    lengths[0] = ByteReverseWord64(aadSz);
    lengths[1] = ByteReverseWord64(dataSz);
#endif

    return wc_Poly1305Update(ctx, (byte *)lengths, sizeof(lengths));
}
876 #endif
877 
878 /*  Takes in an initialized Poly1305 struct that has a key loaded and creates
879     a MAC (tag) using recent TLS AEAD padding scheme.
880     ctx        : Initialized Poly1305 struct to use
881     additional : Additional data to use
882     addSz      : Size of additional buffer
883     input      : Input buffer to create tag from
884     sz         : Size of input buffer
885     tag        : Buffer to hold created tag
886     tagSz      : Size of input tag buffer (must be at least
887                  WC_POLY1305_MAC_SZ(16))
888  */
wc_Poly1305_MAC(Poly1305 * ctx,const byte * additional,word32 addSz,const byte * input,word32 sz,byte * tag,word32 tagSz)889 int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional, word32 addSz,
890                     const byte* input, word32 sz, byte* tag, word32 tagSz)
891 {
892     int ret;
893 
894     /* sanity check on arguments */
895     if (ctx == NULL || input == NULL || tag == NULL ||
896                                                    tagSz < WC_POLY1305_MAC_SZ) {
897         return BAD_FUNC_ARG;
898     }
899 
900     /* additional allowed to be 0 */
901     if (addSz > 0) {
902         if (additional == NULL)
903             return BAD_FUNC_ARG;
904 
905         /* additional data plus padding */
906         if ((ret = wc_Poly1305Update(ctx, additional, addSz)) != 0) {
907             return ret;
908         }
909         /* pad additional data */
910         if ((ret = wc_Poly1305_Pad(ctx, addSz)) != 0) {
911             return ret;
912         }
913     }
914 
915     /* input plus padding */
916     if ((ret = wc_Poly1305Update(ctx, input, sz)) != 0) {
917         return ret;
918     }
919     /* pad input data */
920     if ((ret = wc_Poly1305_Pad(ctx, sz)) != 0) {
921         return ret;
922     }
923 
924     /* encode size of AAD and input data as little endian 64 bit types */
925     if ((ret = wc_Poly1305_EncodeSizes(ctx, addSz, sz)) != 0) {
926         return ret;
927     }
928 
929     /* Finalize the auth tag */
930     ret = wc_Poly1305Final(ctx, tag);
931 
932     return ret;
933 
934 }
935 #endif /* HAVE_POLY1305 */
936