1 /* poly1305.c
2 *
3 * Copyright (C) 2006-2021 wolfSSL Inc.
4 *
5 * This file is part of wolfSSL.
6 *
7 * wolfSSL is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * wolfSSL is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20 */
21 /*
22
23 DESCRIPTION
24 This library contains implementation for the Poly1305 authenticator.
25
26 Based off the public domain implementations by Andrew Moon
27 and Daniel J. Bernstein
28
29 */
30
31
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35
36 #include <wolfssl/wolfcrypt/settings.h>
37
38 #ifdef HAVE_POLY1305
39 #include <wolfssl/wolfcrypt/poly1305.h>
40 #include <wolfssl/wolfcrypt/error-crypt.h>
41 #include <wolfssl/wolfcrypt/logging.h>
42 #include <wolfssl/wolfcrypt/cpuid.h>
43 #ifdef NO_INLINE
44 #include <wolfssl/wolfcrypt/misc.h>
45 #else
46 #define WOLFSSL_MISC_INCLUDED
47 #include <wolfcrypt/src/misc.c>
48 #endif
49 #ifdef CHACHA_AEAD_TEST
50 #include <stdio.h>
51 #endif
52
53 #ifdef _MSC_VER
54 /* 4127 warning constant while(1) */
55 #pragma warning(disable: 4127)
56 #endif
57
58 #ifdef USE_INTEL_SPEEDUP
59 #include <emmintrin.h>
60 #include <immintrin.h>
61
62 #if defined(__GNUC__) && ((__GNUC__ < 4) || \
63 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
64 #undef NO_AVX2_SUPPORT
65 #define NO_AVX2_SUPPORT
66 #endif
67 #if defined(__clang__) && ((__clang_major__ < 3) || \
68 (__clang_major__ == 3 && __clang_minor__ <= 5))
69 #define NO_AVX2_SUPPORT
70 #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
71 #undef NO_AVX2_SUPPORT
72 #endif
73
74 #define HAVE_INTEL_AVX1
75 #ifndef NO_AVX2_SUPPORT
76 #define HAVE_INTEL_AVX2
77 #endif
78 #endif
79
80 #ifdef USE_INTEL_SPEEDUP
81 static word32 intel_flags = 0;
82 static word32 cpu_flags_set = 0;
83 #endif
84
#if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
    #if defined(_MSC_VER)
        #define POLY1305_NOINLINE __declspec(noinline)
    #elif defined(__GNUC__)
        #define POLY1305_NOINLINE __attribute__((noinline))
    #else
        #define POLY1305_NOINLINE
    #endif

    /* 128-bit helpers used by the 64-bit block function.
     *
     * MUL(out, x, y)   out = x * y as a full 128-bit product
     * ADD(out, in)     out += in, both 128-bit
     * ADDLO(out, in)   out += in, where in is a 64-bit value
     * SHR(in, shift)   low 64 bits of (in >> shift)
     * LO(in)           low 64 bits of in
     *
     * Every argument and every expansion is parenthesised, and the
     * multi-statement MSVC forms are wrapped in do { } while (0), so that
     * the macros behave like ordinary expressions/statements whatever is
     * passed to them.
     */
    #if defined(_MSC_VER)
        #include <intrin.h>

        typedef struct word128 {
            word64 lo;
            word64 hi;
        } word128;

        #define MUL(out, x, y) ((out).lo = _umul128((x), (y), &(out).hi))
        /* The carry out of the low half is detected by the low word
         * wrapping below its previous value. */
        #define ADD(out, in) do { word64 poly_t_ = (out).lo;               \
                                  (out).lo += (in).lo;                     \
                                  (out).hi += ((out).lo < poly_t_) +       \
                                              (in).hi; } while (0)
        #define ADDLO(out, in) do { word64 poly_t_ = (out).lo;             \
                                    (out).lo += (in);                      \
                                    (out).hi += ((out).lo < poly_t_);      \
                               } while (0)
        #define SHR(in, shift) (__shiftright128((in).lo, (in).hi, (shift)))
        #define LO(in) ((in).lo)

    #elif defined(__GNUC__)
        #if defined(__SIZEOF_INT128__)
            PEDANTIC_EXTENSION typedef unsigned __int128 word128;
        #else
            typedef unsigned word128 __attribute__((mode(TI)));
        #endif

        #define MUL(out, x, y) ((out) = ((word128)(x) * (y)))
        #define ADD(out, in) ((out) += (in))
        #define ADDLO(out, in) ((out) += (in))
        #define SHR(in, shift) ((word64)((in) >> (shift)))
        #define LO(in) ((word64)(in))
    #endif
#endif
124
125 #ifdef USE_INTEL_SPEEDUP
126 #ifdef __cplusplus
127 extern "C" {
128 #endif
129
130 #ifdef HAVE_INTEL_AVX1
131 /* Process one block (16 bytes) of data.
132 *
133 * ctx Poly1305 context.
134 * m One block of message data.
135 */
136 extern void poly1305_block_avx(Poly1305* ctx, const unsigned char *m);
137 /* Process multiple blocks (n * 16 bytes) of data.
138 *
139 * ctx Poly1305 context.
140 * m Blocks of message data.
141 * bytes The number of bytes to process.
142 */
143 extern void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m,
144 size_t bytes);
145 /* Set the key to use when processing data.
146 * Initialize the context.
147 *
148 * ctx Poly1305 context.
149 * key The key data (16 bytes).
150 */
151 extern void poly1305_setkey_avx(Poly1305* ctx, const byte* key);
152 /* Calculate the final result - authentication data.
153 * Zeros out the private data in the context.
154 *
155 * ctx Poly1305 context.
156 * mac Buffer to hold 16 bytes.
157 */
158 extern void poly1305_final_avx(Poly1305* ctx, byte* mac);
159 #endif
160
161 #ifdef HAVE_INTEL_AVX2
162 /* Process multiple blocks (n * 16 bytes) of data.
163 *
164 * ctx Poly1305 context.
165 * m Blocks of message data.
166 * bytes The number of bytes to process.
167 */
168 extern void poly1305_blocks_avx2(Poly1305* ctx, const unsigned char* m,
169 size_t bytes);
170 /* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
171 *
172 * ctx Poly1305 context.
173 */
174 extern void poly1305_calc_powers_avx2(Poly1305* ctx);
175 /* Set the key to use when processing data.
176 * Initialize the context.
177 * Calls AVX set key function as final function calls AVX code.
178 *
179 * ctx Poly1305 context.
180 * key The key data (16 bytes).
181 */
182 extern void poly1305_setkey_avx2(Poly1305* ctx, const byte* key);
183 /* Calculate the final result - authentication data.
184 * Zeros out the private data in the context.
185 * Calls AVX final function to quickly process last blocks.
186 *
187 * ctx Poly1305 context.
188 * mac Buffer to hold 16 bytes - authentication data.
189 */
190 extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
191 #endif
192
193 #ifdef __cplusplus
194 } /* extern "C" */
195 #endif
196
197 #elif defined(POLY130564)
198 #ifndef WOLFSSL_ARMASM
U8TO64(const byte * p)199 static word64 U8TO64(const byte* p)
200 {
201 return
202 (((word64)(p[0] & 0xff) ) |
203 ((word64)(p[1] & 0xff) << 8) |
204 ((word64)(p[2] & 0xff) << 16) |
205 ((word64)(p[3] & 0xff) << 24) |
206 ((word64)(p[4] & 0xff) << 32) |
207 ((word64)(p[5] & 0xff) << 40) |
208 ((word64)(p[6] & 0xff) << 48) |
209 ((word64)(p[7] & 0xff) << 56));
210 }
211
/* Write v to the 8 bytes at p in little-endian order (least significant
 * byte goes to p[0]). */
static void U64TO8(byte* p, word64 v)
{
    int idx;

    for (idx = 0; idx < 8; idx++) {
        p[idx] = (byte)(v & 0xff);
        v >>= 8;
    }
}
222 #endif/* WOLFSSL_ARMASM */
223 #else /* if not 64 bit then use 32 bit */
224
U8TO32(const byte * p)225 static word32 U8TO32(const byte *p)
226 {
227 return
228 (((word32)(p[0] & 0xff) ) |
229 ((word32)(p[1] & 0xff) << 8) |
230 ((word32)(p[2] & 0xff) << 16) |
231 ((word32)(p[3] & 0xff) << 24));
232 }
233
/* Write v to the 4 bytes at p in little-endian order (least significant
 * byte goes to p[0]). */
static void U32TO8(byte *p, word32 v)
{
    int idx;

    for (idx = 0; idx < 4; idx++) {
        p[idx] = (byte)(v & 0xff);
        v >>= 8;
    }
}
240 #endif
241
242 /* convert 32-bit unsigned to little endian 64 bit type as byte array */
u32tole64(const word32 inLe32,byte outLe64[8])243 static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8])
244 {
245 #ifndef WOLFSSL_X86_64_BUILD
246 outLe64[0] = (byte)(inLe32 & 0x000000FF);
247 outLe64[1] = (byte)((inLe32 & 0x0000FF00) >> 8);
248 outLe64[2] = (byte)((inLe32 & 0x00FF0000) >> 16);
249 outLe64[3] = (byte)((inLe32 & 0xFF000000) >> 24);
250 outLe64[4] = 0;
251 outLe64[5] = 0;
252 outLe64[6] = 0;
253 outLe64[7] = 0;
254 #else
255 *(word64*)outLe64 = inLe32;
256 #endif
257 }
258
259
260 #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
261 /*
262 This local function operates on a message with a given number of bytes
263 with a given ctx pointer to a Poly1305 structure.
264 */
poly1305_blocks(Poly1305 * ctx,const unsigned char * m,size_t bytes)265 static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
266 size_t bytes)
267 {
268 #ifdef USE_INTEL_SPEEDUP
269 /* AVX2 is handled in wc_Poly1305Update. */
270 SAVE_VECTOR_REGISTERS(return _svr_ret;);
271 poly1305_blocks_avx(ctx, m, bytes);
272 RESTORE_VECTOR_REGISTERS();
273 return 0;
274 #elif defined(POLY130564)
275 const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
276 word64 r0,r1,r2;
277 word64 s1,s2;
278 word64 h0,h1,h2;
279 word64 c;
280 word128 d0,d1,d2,d;
281
282 r0 = ctx->r[0];
283 r1 = ctx->r[1];
284 r2 = ctx->r[2];
285
286 h0 = ctx->h[0];
287 h1 = ctx->h[1];
288 h2 = ctx->h[2];
289
290 s1 = r1 * (5 << 2);
291 s2 = r2 * (5 << 2);
292
293 while (bytes >= POLY1305_BLOCK_SIZE) {
294 word64 t0,t1;
295
296 /* h += m[i] */
297 t0 = U8TO64(&m[0]);
298 t1 = U8TO64(&m[8]);
299
300 h0 += (( t0 ) & 0xfffffffffff);
301 h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
302 h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit;
303
304 /* h *= r */
305 MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
306 MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
307 MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);
308
309 /* (partial) h %= p */
310 c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
311 ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
312 ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
313 h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff;
314 h1 += c;
315
316 m += POLY1305_BLOCK_SIZE;
317 bytes -= POLY1305_BLOCK_SIZE;
318 }
319
320 ctx->h[0] = h0;
321 ctx->h[1] = h1;
322 ctx->h[2] = h2;
323
324 return 0;
325
326 #else /* if not 64 bit then use 32 bit */
327 const word32 hibit = (ctx->finished) ? 0 : ((word32)1 << 24); /* 1 << 128 */
328 word32 r0,r1,r2,r3,r4;
329 word32 s1,s2,s3,s4;
330 word32 h0,h1,h2,h3,h4;
331 word64 d0,d1,d2,d3,d4;
332 word32 c;
333
334
335 r0 = ctx->r[0];
336 r1 = ctx->r[1];
337 r2 = ctx->r[2];
338 r3 = ctx->r[3];
339 r4 = ctx->r[4];
340
341 s1 = r1 * 5;
342 s2 = r2 * 5;
343 s3 = r3 * 5;
344 s4 = r4 * 5;
345
346 h0 = ctx->h[0];
347 h1 = ctx->h[1];
348 h2 = ctx->h[2];
349 h3 = ctx->h[3];
350 h4 = ctx->h[4];
351
352 while (bytes >= POLY1305_BLOCK_SIZE) {
353 /* h += m[i] */
354 h0 += (U8TO32(m+ 0) ) & 0x3ffffff;
355 h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
356 h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
357 h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
358 h4 += (U8TO32(m+12) >> 8) | hibit;
359
360 /* h *= r */
361 d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) +
362 ((word64)h3 * s2) + ((word64)h4 * s1);
363 d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) +
364 ((word64)h3 * s3) + ((word64)h4 * s2);
365 d2 = ((word64)h0 * r2) + ((word64)h1 * r1) + ((word64)h2 * r0) +
366 ((word64)h3 * s4) + ((word64)h4 * s3);
367 d3 = ((word64)h0 * r3) + ((word64)h1 * r2) + ((word64)h2 * r1) +
368 ((word64)h3 * r0) + ((word64)h4 * s4);
369 d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) +
370 ((word64)h3 * r1) + ((word64)h4 * r0);
371
372 /* (partial) h %= p */
373 c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff;
374 d1 += c; c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff;
375 d2 += c; c = (word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff;
376 d3 += c; c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff;
377 d4 += c; c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff;
378 h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff;
379 h1 += c;
380
381 m += POLY1305_BLOCK_SIZE;
382 bytes -= POLY1305_BLOCK_SIZE;
383 }
384
385 ctx->h[0] = h0;
386 ctx->h[1] = h1;
387 ctx->h[2] = h2;
388 ctx->h[3] = h3;
389 ctx->h[4] = h4;
390
391 return 0;
392
393 #endif /* end of 64 bit cpu blocks or 32 bit cpu */
394 }
395
396 /*
397 This local function is used for the last call when a message with a given
398 number of bytes is less than the block size.
399 */
poly1305_block(Poly1305 * ctx,const unsigned char * m)400 static int poly1305_block(Poly1305* ctx, const unsigned char *m)
401 {
402 #ifdef USE_INTEL_SPEEDUP
403 /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
404 SAVE_VECTOR_REGISTERS(return _svr_ret;);
405 poly1305_block_avx(ctx, m);
406 RESTORE_VECTOR_REGISTERS();
407 return 0;
408 #else
409 return poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
410 #endif
411 }
412 #endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
413
414 #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
/* Load a one-time key into a Poly1305 context and reset the context.
 *
 * ctx    Poly1305 context to initialise.
 * key    Key material.  The portable code takes r from bytes 0..15 (with
 *        the clamping masks applied) and the final pad from bytes 16..31.
 * keySz  Length of key in bytes; must be 32.
 *
 * Returns 0 on success and BAD_FUNC_ARG when ctx or key is NULL or keySz is
 * not 32.  With USE_INTEL_SPEEDUP the failure clause handed to
 * SAVE_VECTOR_REGISTERS may return its own value.
 */
int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
{
#if defined(POLY130564) && !defined(USE_INTEL_SPEEDUP)
    word64 t0,t1;
#endif

    /* Check every argument before anything (including the debug dump
     * below) looks at the key or the context. */
    if (ctx == NULL || key == NULL || keySz != 32)
        return BAD_FUNC_ARG;

#ifdef CHACHA_AEAD_TEST
    {
        /* own block so the declaration stays ahead of the statements */
        word32 k;

        printf("Poly key used:\n");
        for (k = 0; k < keySz; k++) {
            printf("%02x", key[k]);
            if ((k+1) % 8 == 0)
                printf("\n");
        }
        printf("\n");
    }
#endif

#ifdef USE_INTEL_SPEEDUP
    /* query the CPU features once and remember them for later calls */
    if (!cpu_flags_set) {
        intel_flags = cpuid_get_flags();
        cpu_flags_set = 1;
    }
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags))
        poly1305_setkey_avx2(ctx, key);
    else
    #endif
        poly1305_setkey_avx(ctx, key);
    RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff, split into 44/44/42 bit
     * limbs */
    t0 = U8TO64(key + 0);
    t1 = U8TO64(key + 8);

    ctx->r[0] = ( t0                    ) & 0xffc0fffffff;
    ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
    ctx->r[2] = ((t1 >> 24)             ) & 0x00ffffffc0f;

    /* h (accumulator) = 0 */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;

    /* save pad for later */
    ctx->pad[0] = U8TO64(key + 16);
    ctx->pad[1] = U8TO64(key + 24);

    ctx->leftover = 0;
    ctx->finished = 0;

#else /* if not 64 bit then use 32 bit */

    /* r &= 0xffffffc0ffffffc0ffffffc0fffffff, split into five 26 bit
     * limbs */
    ctx->r[0] = (U8TO32(key +  0)     ) & 0x3ffffff;
    ctx->r[1] = (U8TO32(key +  3) >> 2) & 0x3ffff03;
    ctx->r[2] = (U8TO32(key +  6) >> 4) & 0x3ffc0ff;
    ctx->r[3] = (U8TO32(key +  9) >> 6) & 0x3f03fff;
    ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff;

    /* h = 0 */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->h[3] = 0;
    ctx->h[4] = 0;

    /* save pad for later */
    ctx->pad[0] = U8TO32(key + 16);
    ctx->pad[1] = U8TO32(key + 20);
    ctx->pad[2] = U8TO32(key + 24);
    ctx->pad[3] = U8TO32(key + 28);

    ctx->leftover = 0;
    ctx->finished = 0;

#endif

    return 0;
}
502
/* Finish the MAC computation and write the 16-byte tag.
 *
 * The portable code pads and absorbs any buffered partial block, reduces
 * the accumulator modulo 2^130 - 5, adds the pad saved by
 * wc_Poly1305SetKey, stores the low 128 bits in mac and finally clears h, r
 * and pad in the context.  With USE_INTEL_SPEEDUP the work is handed to the
 * AVX/AVX2 routines.
 *
 * ctx  Poly1305 context holding the key and the data absorbed so far.
 * mac  Buffer that receives 16 bytes.
 *
 * Returns 0 on success and BAD_FUNC_ARG when ctx or mac is NULL.  With
 * USE_INTEL_SPEEDUP the failure clause handed to SAVE_VECTOR_REGISTERS may
 * return its own value.
 */
int wc_Poly1305Final(Poly1305* ctx, byte* mac)
{
#ifndef USE_INTEL_SPEEDUP
#ifdef POLY130564
    word64 h0, h1, h2;
    word64 g0, g1, g2;
    word64 carry;
    word64 useG;        /* all ones when h >= p, zero otherwise */
    word64 pad0, pad1;
#else
    word32 h0, h1, h2, h3, h4;
    word32 g0, g1, g2, g3, g4;
    word32 carry;
    word32 useG;        /* all ones when h >= p, zero otherwise */
    word64 sum;
#endif
#endif

    if (mac == NULL || ctx == NULL)
        return BAD_FUNC_ARG;

#ifdef USE_INTEL_SPEEDUP
    SAVE_VECTOR_REGISTERS(return _svr_ret;);
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags)) {
        poly1305_final_avx2(ctx, mac);
    }
    else
    #endif
    {
        poly1305_final_avx(ctx, mac);
    }
    RESTORE_VECTOR_REGISTERS();
#elif defined(POLY130564)

    /* absorb the buffered partial block: append a 1 byte, zero fill the
     * rest and flag the context so the block function adds no high bit */
    if (ctx->leftover != 0) {
        size_t pos = ctx->leftover;

        ctx->buffer[pos++] = 1;
        while (pos < POLY1305_BLOCK_SIZE) {
            ctx->buffer[pos++] = 0;
        }
        ctx->finished = 1;
        poly1305_block(ctx, ctx->buffer);
    }

    /* fully carry h */
    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];

    carry = (h1 >> 44);
    h1 &= 0xfffffffffff;

    h2 += carry;
    carry = (h2 >> 42);
    h2 &= 0x3ffffffffff;

    h0 += carry * 5;
    carry = (h0 >> 44);
    h0 &= 0xfffffffffff;

    h1 += carry;
    carry = (h1 >> 44);
    h1 &= 0xfffffffffff;

    h2 += carry;
    carry = (h2 >> 42);
    h2 &= 0x3ffffffffff;

    h0 += carry * 5;
    carry = (h0 >> 44);
    h0 &= 0xfffffffffff;

    h1 += carry;

    /* compute g = h + -p */
    g0 = h0 + 5;
    carry = (g0 >> 44);
    g0 &= 0xfffffffffff;

    g1 = h1 + carry;
    carry = (g1 >> 44);
    g1 &= 0xfffffffffff;

    g2 = h2 + carry - ((word64)1 << 42);

    /* pick h when h < p (g2 went negative, top bit set) and g otherwise,
     * without branching on the value */
    useG = (g2 >> ((sizeof(word64) * 8) - 1)) - 1;
    h0 = (h0 & ~useG) | (g0 & useG);
    h1 = (h1 & ~useG) | (g1 & useG);
    h2 = (h2 & ~useG) | (g2 & useG);

    /* h = (h + pad) */
    pad0 = ctx->pad[0];
    pad1 = ctx->pad[1];

    h0 += (pad0 & 0xfffffffffff);
    carry = (h0 >> 44);
    h0 &= 0xfffffffffff;

    h1 += (((pad0 >> 44) | (pad1 << 20)) & 0xfffffffffff) + carry;
    carry = (h1 >> 44);
    h1 &= 0xfffffffffff;

    h2 += ((pad1 >> 24) & 0x3ffffffffff) + carry;
    h2 &= 0x3ffffffffff;

    /* mac = h % (2^128): repack the limbs into two 64 bit words */
    U64TO8(mac + 0, (h0      ) | (h1 << 44));
    U64TO8(mac + 8, (h1 >> 20) | (h2 << 24));

    /* zero out the state */
    ctx->h[0] = ctx->h[1] = ctx->h[2] = 0;
    ctx->r[0] = ctx->r[1] = ctx->r[2] = 0;
    ctx->pad[0] = ctx->pad[1] = 0;

#else /* if not 64 bit then use 32 bit */

    /* absorb the buffered partial block: append a 1 byte, zero fill the
     * rest and flag the context so the block function adds no high bit */
    if (ctx->leftover != 0) {
        size_t pos = ctx->leftover;

        ctx->buffer[pos++] = 1;
        while (pos < POLY1305_BLOCK_SIZE) {
            ctx->buffer[pos++] = 0;
        }
        ctx->finished = 1;
        poly1305_block(ctx, ctx->buffer);
    }

    /* fully carry h */
    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];
    h3 = ctx->h[3];
    h4 = ctx->h[4];

    carry = h1 >> 26;
    h1 = h1 & 0x3ffffff;

    h2 += carry;
    carry = h2 >> 26;
    h2 = h2 & 0x3ffffff;

    h3 += carry;
    carry = h3 >> 26;
    h3 = h3 & 0x3ffffff;

    h4 += carry;
    carry = h4 >> 26;
    h4 = h4 & 0x3ffffff;

    h0 += carry * 5;
    carry = h0 >> 26;
    h0 = h0 & 0x3ffffff;

    h1 += carry;

    /* compute g = h + -p */
    g0 = h0 + 5;
    carry = g0 >> 26;
    g0 &= 0x3ffffff;

    g1 = h1 + carry;
    carry = g1 >> 26;
    g1 &= 0x3ffffff;

    g2 = h2 + carry;
    carry = g2 >> 26;
    g2 &= 0x3ffffff;

    g3 = h3 + carry;
    carry = g3 >> 26;
    g3 &= 0x3ffffff;

    g4 = h4 + carry - ((word32)1 << 26);

    /* pick h when h < p (g4 went negative, top bit set) and g otherwise,
     * without branching on the value */
    useG = ((word32)g4 >> ((sizeof(word32) * 8) - 1)) - 1;
    h0 = (h0 & ~useG) | (g0 & useG);
    h1 = (h1 & ~useG) | (g1 & useG);
    h2 = (h2 & ~useG) | (g2 & useG);
    h3 = (h3 & ~useG) | (g3 & useG);
    h4 = (h4 & ~useG) | (g4 & useG);

    /* h = h % (2^128): repack the 26 bit limbs into four 32 bit words.
     * Each line still needs the old value of the limb above it, so the
     * order must stay bottom-up. */
    h0 = ((h0      ) | (h1 << 26)) & 0xffffffff;
    h1 = ((h1 >>  6) | (h2 << 20)) & 0xffffffff;
    h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
    h3 = ((h3 >> 18) | (h4 <<  8)) & 0xffffffff;

    /* mac = (h + pad) % (2^128) */
    sum = (word64)h0 + ctx->pad[0];
    h0 = (word32)sum;
    sum = (word64)h1 + ctx->pad[1] + (sum >> 32);
    h1 = (word32)sum;
    sum = (word64)h2 + ctx->pad[2] + (sum >> 32);
    h2 = (word32)sum;
    sum = (word64)h3 + ctx->pad[3] + (sum >> 32);
    h3 = (word32)sum;

    U32TO8(mac +  0, h0);
    U32TO8(mac +  4, h1);
    U32TO8(mac +  8, h2);
    U32TO8(mac + 12, h3);

    /* zero out the state */
    ctx->h[0] = ctx->h[1] = ctx->h[2] = ctx->h[3] = ctx->h[4] = 0;
    ctx->r[0] = ctx->r[1] = ctx->r[2] = ctx->r[3] = ctx->r[4] = 0;
    ctx->pad[0] = ctx->pad[1] = ctx->pad[2] = ctx->pad[3] = 0;

#endif

    return 0;
}
685 #endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
686
687
/* Feed message data into a Poly1305 computation.
 *
 * Data that does not fill a whole block is kept in ctx->buffer and combined
 * with the data of later calls.
 *
 * ctx    Poly1305 context with a key loaded.
 * m      Data to absorb; may be NULL only when bytes is 0.
 * bytes  Number of bytes at m; 0 is accepted and does nothing.
 *
 * Returns 0 on success, BAD_FUNC_ARG when ctx is NULL or m is NULL with
 * bytes > 0, or the error reported by the block functions.
 */
int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
{
    size_t i;
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
    int ret;
#endif

    if (ctx == NULL || (m == NULL && bytes > 0))
        return BAD_FUNC_ARG;

    if (bytes == 0) {
        /* valid, but do nothing */
        return 0;
    }
#ifdef CHACHA_AEAD_TEST
    {
        /* own block so the declaration stays ahead of the statements */
        word32 k;

        printf("Raw input to poly:\n");
        for (k = 0; k < bytes; k++) {
            printf("%02x", m[k]);
            if ((k+1) % 16 == 0)
                printf("\n");
        }
        printf("\n");
    }
#endif

#ifdef USE_INTEL_SPEEDUP
    #ifdef HAVE_INTEL_AVX2
    if (IS_INTEL_AVX2(intel_flags)) {
        SAVE_VECTOR_REGISTERS(return _svr_ret;);

        /* handle leftover: top up the internal buffer first */
        if (ctx->leftover) {
            size_t want = sizeof(ctx->buffer) - ctx->leftover;
            if (want > bytes)
                want = bytes;

            for (i = 0; i < want; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            bytes -= (word32)want;
            m += want;
            ctx->leftover += want;
            if (ctx->leftover < sizeof(ctx->buffer)) {
                /* still not a full buffer, wait for more data */
                RESTORE_VECTOR_REGISTERS();
                return 0;
            }

            if (!ctx->started)
                poly1305_calc_powers_avx2(ctx);
            poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
            ctx->leftover = 0;
        }

        /* process full blocks: as many whole buffer sized chunks as the
         * input holds */
        if (bytes >= sizeof(ctx->buffer)) {
            size_t want = bytes & ~(sizeof(ctx->buffer) - 1);

            if (!ctx->started)
                poly1305_calc_powers_avx2(ctx);
            poly1305_blocks_avx2(ctx, m, want);
            m += want;
            bytes -= (word32)want;
        }

        /* store leftover */
        if (bytes) {
            for (i = 0; i < bytes; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            ctx->leftover += bytes;
        }
        RESTORE_VECTOR_REGISTERS();
    }
    else
    #endif
#endif
    {
        /* handle leftover: top up the internal buffer first */
        if (ctx->leftover) {
            size_t want = (POLY1305_BLOCK_SIZE - ctx->leftover);
            if (want > bytes)
                want = bytes;
            for (i = 0; i < want; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            bytes -= (word32)want;
            m += want;
            ctx->leftover += want;
            if (ctx->leftover < POLY1305_BLOCK_SIZE)
                return 0;
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
            /* poly1305_block() can fail (for instance when the vector
             * registers cannot be saved); report that like a failure of
             * poly1305_blocks() below instead of dropping it. */
            ret = poly1305_block(ctx, ctx->buffer);
            if (ret != 0)
                return ret;
#else
            poly1305_block(ctx, ctx->buffer);
#endif
            ctx->leftover = 0;
        }

        /* process full blocks */
        if (bytes >= POLY1305_BLOCK_SIZE) {
            size_t want = (bytes & ~(POLY1305_BLOCK_SIZE - 1));
#if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
            ret = poly1305_blocks(ctx, m, want);
            if (ret != 0)
                return ret;
#else
            poly1305_blocks(ctx, m, want);
#endif
            m += want;
            bytes -= (word32)want;
        }

        /* store leftover */
        if (bytes) {
            for (i = 0; i < bytes; i++)
                ctx->buffer[ctx->leftover + i] = m[i];
            ctx->leftover += bytes;
        }
    }

    return 0;
}
802
803 /* Takes a Poly1305 struct that has a key loaded and pads the provided length
804 ctx : Initialized Poly1305 struct to use
805 lenToPad : Current number of bytes updated that needs padding to 16
806 */
wc_Poly1305_Pad(Poly1305 * ctx,word32 lenToPad)807 int wc_Poly1305_Pad(Poly1305* ctx, word32 lenToPad)
808 {
809 int ret = 0;
810 word32 paddingLen;
811 byte padding[WC_POLY1305_PAD_SZ - 1];
812
813 if (ctx == NULL) {
814 return BAD_FUNC_ARG;
815 }
816 if (lenToPad == 0) {
817 return 0; /* nothing needs to be done */
818 }
819
820 XMEMSET(padding, 0, sizeof(padding));
821
822 /* Pad length to 16 bytes */
823 paddingLen = (-(int)lenToPad) & (WC_POLY1305_PAD_SZ - 1);
824 if ((paddingLen > 0) && (paddingLen < WC_POLY1305_PAD_SZ)) {
825 ret = wc_Poly1305Update(ctx, padding, paddingLen);
826 }
827 return ret;
828 }
829
830 /* Takes a Poly1305 struct that has a key loaded and adds the AEAD length
831 encoding in 64-bit little endian
832 aadSz : Size of the additional authentication data
833 dataSz : Size of the plaintext or ciphertext
834 */
wc_Poly1305_EncodeSizes(Poly1305 * ctx,word32 aadSz,word32 dataSz)835 int wc_Poly1305_EncodeSizes(Poly1305* ctx, word32 aadSz, word32 dataSz)
836 {
837 int ret;
838 byte little64[16]; /* sizeof(word64) * 2 */
839
840 if (ctx == NULL) {
841 return BAD_FUNC_ARG;
842 }
843
844 XMEMSET(little64, 0, sizeof(little64));
845
846 /* size of additional data and input data as little endian 64 bit types */
847 u32tole64(aadSz, little64);
848 u32tole64(dataSz, little64 + 8);
849 ret = wc_Poly1305Update(ctx, little64, sizeof(little64));
850
851 return ret;
852 }
853
854 #ifdef WORD64_AVAILABLE
/* Same as wc_Poly1305_EncodeSizes, but takes the two lengths as 64-bit
   values.
    ctx    : Initialized Poly1305 struct to use
    aadSz  : Size of the additional authentication data
    dataSz : Size of the plaintext or ciphertext

   Returns BAD_FUNC_ARG when ctx is NULL, otherwise the result of
   wc_Poly1305Update.
*/
int wc_Poly1305_EncodeSizes64(Poly1305* ctx, word64 aadSz, word64 dataSz)
{
    word64 lengths[2];

    if (ctx == NULL) {
        return BAD_FUNC_ARG;
    }

    lengths[0] = aadSz;
    lengths[1] = dataSz;
#ifdef BIG_ENDIAN_ORDER
    /* the bytes handed to the MAC must be in little endian order */
    lengths[0] = ByteReverseWord64(lengths[0]);
    lengths[1] = ByteReverseWord64(lengths[1]);
#endif

    return wc_Poly1305Update(ctx, (byte *)lengths, sizeof(lengths));
}
876 #endif
877
878 /* Takes in an initialized Poly1305 struct that has a key loaded and creates
879 a MAC (tag) using recent TLS AEAD padding scheme.
880 ctx : Initialized Poly1305 struct to use
881 additional : Additional data to use
882 addSz : Size of additional buffer
883 input : Input buffer to create tag from
884 sz : Size of input buffer
885 tag : Buffer to hold created tag
886 tagSz : Size of input tag buffer (must be at least
887 WC_POLY1305_MAC_SZ(16))
888 */
wc_Poly1305_MAC(Poly1305 * ctx,const byte * additional,word32 addSz,const byte * input,word32 sz,byte * tag,word32 tagSz)889 int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional, word32 addSz,
890 const byte* input, word32 sz, byte* tag, word32 tagSz)
891 {
892 int ret;
893
894 /* sanity check on arguments */
895 if (ctx == NULL || input == NULL || tag == NULL ||
896 tagSz < WC_POLY1305_MAC_SZ) {
897 return BAD_FUNC_ARG;
898 }
899
900 /* additional allowed to be 0 */
901 if (addSz > 0) {
902 if (additional == NULL)
903 return BAD_FUNC_ARG;
904
905 /* additional data plus padding */
906 if ((ret = wc_Poly1305Update(ctx, additional, addSz)) != 0) {
907 return ret;
908 }
909 /* pad additional data */
910 if ((ret = wc_Poly1305_Pad(ctx, addSz)) != 0) {
911 return ret;
912 }
913 }
914
915 /* input plus padding */
916 if ((ret = wc_Poly1305Update(ctx, input, sz)) != 0) {
917 return ret;
918 }
919 /* pad input data */
920 if ((ret = wc_Poly1305_Pad(ctx, sz)) != 0) {
921 return ret;
922 }
923
924 /* encode size of AAD and input data as little endian 64 bit types */
925 if ((ret = wc_Poly1305_EncodeSizes(ctx, addSz, sz)) != 0) {
926 return ret;
927 }
928
929 /* Finalize the auth tag */
930 ret = wc_Poly1305Final(ctx, tag);
931
932 return ret;
933
934 }
935 #endif /* HAVE_POLY1305 */
936