/* poly1305.c  -  Poly1305 internals and generic implementation
 * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
19 
20 #include <config.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 
25 #include "types.h"
26 #include "g10lib.h"
27 #include "cipher.h"
28 #include "bufhelp.h"
29 #include "poly1305-internal.h"
30 
31 #include "mpi-internal.h"
32 #include "longlong.h"
33 
34 
35 static const char *selftest (void);
36 
37 
38 #undef HAVE_ASM_POLY1305_BLOCKS
39 
40 
41 #undef USE_MPI_64BIT
42 #undef USE_MPI_32BIT
43 #if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_TYPE_U64)
44 # define USE_MPI_64BIT 1
45 #elif BYTES_PER_MPI_LIMB == 4
46 # define USE_MPI_32BIT 1
47 #else
48 # error please implement for this limb size.
49 #endif
50 
51 
/* USE_S390X_ASM indicates whether to enable zSeries code.  It requires
 * 8-byte MPI limbs, an s390x target compiled by GCC 4 or newer (or a
 * compiler reporting itself as such) with __ARCH__ >= 9, and the
 * HAVE_GCC_INLINE_ASM_S390X configuration flag.  */
#undef USE_S390X_ASM
#if BYTES_PER_MPI_LIMB == 8
# if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 \
     && defined(HAVE_GCC_INLINE_ASM_S390X)
#  define USE_S390X_ASM 1
# endif
#endif
61 
62 
#ifdef USE_S390X_ASM

/* An assembly block function is provided, so the generic C versions of
 * poly1305_blocks below are compiled out.  */
#define HAVE_ASM_POLY1305_BLOCKS 1

/* Declared here and defined outside this file (presumably in the s390x
 * assembly source -- confirm against the build).  */
extern unsigned int _gcry_poly1305_s390x_blocks1(void *state,
                                                 const byte *buf, size_t len,
                                                 byte high_pad);

/* Thin adapter: forward the block-processing request to the s390x routine,
 * handing it the arithmetic state embedded in CTX.  The return value of the
 * assembly routine is passed through unchanged.  */
static unsigned int
poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
                 byte high_pad)
{
  void *asm_state = &ctx->state;

  return _gcry_poly1305_s390x_blocks1 (asm_state, buf, len, high_pad);
}

#endif /* USE_S390X_ASM */
79 
80 
/* Prepare CTX for a new message under KEY (key bytes 0..31 are read).
 *
 *  - the accumulator h[0..4] and the partial-block counter are cleared;
 *  - r[0..3] is loaded from key bytes 0..15 as little-endian 32-bit words
 *    with the clamp masks applied;
 *  - k[0..3] is loaded from key bytes 16..31 as little-endian 32-bit words.
 */
static void poly1305_init (poly1305_context_t *ctx,
                           const byte key[POLY1305_KEYLEN])
{
  POLY1305_STATE *state = &ctx->state;
  unsigned int w;

  ctx->leftover = 0;

  for (w = 0; w < 5; w++)
    state->h[w] = 0;

  for (w = 0; w < 4; w++)
    {
      /* Word 0 only loses its top four bits; words 1..3 additionally lose
       * their two lowest bits.  */
      state->r[w] = buf_get_le32(key + 4 * w)
                    & (w == 0 ? 0x0fffffff : 0x0ffffffc);

      state->k[w] = buf_get_le32(key + 16 + 4 * w);
    }
}
104 
105 
106 #ifdef USE_MPI_64BIT
107 
#if defined (__aarch64__) && __GNUC__ >= 4

/* A += B (armv8/aarch64).  Three-limb addition A2:A1:A0 += B2:B1:B0; the
 * carry out of the low limb feeds the middle one and so on.  */
#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
      __asm__ ("adds %[dst0], %[src0], %[dst0]\n" \
               "adcs %[dst1], %[src1], %[dst1]\n" \
               "adc  %[dst2], %[src2], %[dst2]\n" \
               : [dst0] "+r" (A0), [dst1] "+r" (A1), [dst2] "+r" (A2) \
               : [src0] "r" (B0), [src1] "r" (B1), [src2] "r" (B2) \
               : "cc" )

#endif /* __aarch64__ */

#if defined (__x86_64__) && __GNUC__ >= 4

/* A += B (x86-64).  Same three-limb add-with-carry chain; the B operands
 * may be registers, memory or immediates.  */
#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
      __asm__ ("addq %[src0], %[dst0]\n" \
               "adcq %[src1], %[dst1]\n" \
               "adcq %[src2], %[dst2]\n" \
               : [dst0] "+r" (A0), [dst1] "+r" (A1), [dst2] "+r" (A2) \
               : [src0] "g" (B0), [src1] "g" (B1), [src2] "g" (B2) \
               : "cc" )

#endif /* __x86_64__ */

#if defined (__powerpc__) && __GNUC__ >= 4

/* A += B (ppc64).  Same three-limb add-with-carry chain.  */
#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
      __asm__ ("addc %[dst0], %[src0], %[dst0]\n" \
               "adde %[dst1], %[src1], %[dst1]\n" \
               "adde %[dst2], %[src2], %[dst2]\n" \
               : [dst0] "+r" (A0), [dst1] "+r" (A1), [dst2] "+r" (A2) \
               : [src0] "r" (B0), [src1] "r" (B1), [src2] "r" (B2) \
               : "cc" )

#endif /* __powerpc__ */

#ifndef ADD_1305_64
/* A += B (generic, mpi).  Used when none of the inline-assembly variants
 * above was selected.  The low limbs are added first to obtain the carry,
 * then the upper two limbs are added as a pair and the carry is folded in.
 * Arguments are evaluated more than once; pass plain variables/constants. */
#  define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \
    u64 add64_cy; \
    add_ssaaaa(add64_cy, A0, 0, A0, 0, B0); \
    add_ssaaaa(A2, A1, A2, A1, B2, B1); \
    add_ssaaaa(A2, A1, A2, A1, 0, add64_cy); \
  } while (0)
#endif
156 
/* H = H * R (partially reduced mod 2^130-5).
 *
 * H2:H1:H0 is the accumulator, R1:R0 the multiplier, and R1_MULT5 the value
 * the caller precomputes from R1 for partial products whose weight wraps
 * past 2^130.  Two 128-bit column sums are built (c0 for weight 2^0, c1 for
 * weight 2^64); the bits at and above 2^130 are then folded back in
 * multiplied by 5.  The result replaces H2:H1:H0.
 *
 * H2 is multiplied with plain one-word multiplies, which relies on H2
 * staying small; this macro itself leaves only the two bits above 2^128
 * plus a possible carry in it.  Arguments are evaluated several times, so
 * pass plain variables only.  */
#define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \
    u64 c0_lo, c0_hi, c1_lo, c1_hi; \
    u64 pp0_lo, pp0_hi, pp1_lo, pp1_hi; \
    \
    /* c0 = h0 * r0 ; c1 = h0 * r1 */ \
    umul_ppmm(c0_hi, c0_lo, H0, R0); \
    umul_ppmm(c1_hi, c1_lo, H0, R1); \
    \
    /* c0 += h1 * r1 (wrapped, hence R1_MULT5) ; c1 += h1 * r0 */ \
    umul_ppmm(pp0_hi, pp0_lo, H1, R1_MULT5); \
    add_ssaaaa(c0_hi, c0_lo, c0_hi, c0_lo, pp0_hi, pp0_lo); \
    umul_ppmm(pp1_hi, pp1_lo, H1, R0); \
    add_ssaaaa(c1_hi, c1_lo, c1_hi, c1_lo, pp1_hi, pp1_lo); \
    \
    /* c1 += (h2 * r0) : (h2 * r1 wrapped).  After this H1 holds the new */ \
    /* middle limb and H0 temporarily holds everything from 2^128 up.   */ \
    pp1_lo = H2 * R1_MULT5; \
    pp1_hi = H2 * R0; \
    add_ssaaaa(H0, H1, c1_hi, c1_lo, pp1_hi, pp1_lo); \
    \
    /* Keep bits 128..129 in H2, fold the rest back in times 5, then add */ \
    /* the weight-2^0 column.                                           */ \
    H2 = H0 & 3; \
    H0 = (H0 >> 2) * 5; \
    ADD_1305_64(H2, H1, H0, (u64)0, c0_hi, c0_lo); \
  } while (0)
180 
181 #ifndef HAVE_ASM_POLY1305_BLOCKS
182 
183 static unsigned int
poly1305_blocks(poly1305_context_t * ctx,const byte * buf,size_t len,byte high_pad)184 poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
185 		 byte high_pad)
186 {
187   POLY1305_STATE *st = &ctx->state;
188   u64 r0, r1, r1_mult5;
189   u64 h0, h1, h2;
190   u64 m0, m1, m2;
191 
192   m2 = high_pad;
193 
194   h0 = st->h[0] + ((u64)st->h[1] << 32);
195   h1 = st->h[2] + ((u64)st->h[3] << 32);
196   h2 = st->h[4];
197 
198   r0 = st->r[0] + ((u64)st->r[1] << 32);
199   r1 = st->r[2] + ((u64)st->r[3] << 32);
200 
201   r1_mult5 = (r1 >> 2) + r1;
202 
203   m0 = buf_get_le64(buf + 0);
204   m1 = buf_get_le64(buf + 8);
205   buf += POLY1305_BLOCKSIZE;
206   len -= POLY1305_BLOCKSIZE;
207 
208   while (len >= POLY1305_BLOCKSIZE)
209     {
210       /* a = h + m */
211       ADD_1305_64(h2, h1, h0, m2, m1, m0);
212 
213       m0 = buf_get_le64(buf + 0);
214       m1 = buf_get_le64(buf + 8);
215 
216       /* h = a * r (partial mod 2^130-5) */
217       MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
218 
219       buf += POLY1305_BLOCKSIZE;
220       len -= POLY1305_BLOCKSIZE;
221     }
222 
223   /* a = h + m */
224   ADD_1305_64(h2, h1, h0, m2, m1, m0);
225 
226   /* h = a * r (partial mod 2^130-5) */
227   MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
228 
229   st->h[0] = h0;
230   st->h[1] = h0 >> 32;
231   st->h[2] = h1;
232   st->h[3] = h1 >> 32;
233   st->h[4] = h2;
234 
235   return 6 * sizeof (void *) + 18 * sizeof (u64);
236 }
237 
238 #endif /* !HAVE_ASM_POLY1305_BLOCKS */
239 
/* Finish the MAC computation (64-bit limb version) and write the 16-byte
 * tag to MAC.
 *
 * Any buffered partial block is terminated with a single 1 byte, zero
 * filled to a whole block and absorbed with high_pad == 0.  The accumulator
 * is then conditionally reduced, the key words k are added and the low 128
 * bits are stored little-endian.
 *
 * Returns the number of stack bytes the caller should burn.  */
static unsigned int poly1305_final (poly1305_context_t *ctx,
                                    byte mac[POLY1305_TAGLEN])
{
  POLY1305_STATE *state = &ctx->state;
  unsigned int burn_depth = 0;
  u64 acc0, acc1, acc2;
  u64 pad0, pad1;
  u64 cy, sel;

  /* process the remaining block */
  if (ctx->leftover)
    {
      ctx->buffer[ctx->leftover++] = 1;
      if (ctx->leftover < POLY1305_BLOCKSIZE)
        {
          memset (&ctx->buffer[ctx->leftover], 0,
                  POLY1305_BLOCKSIZE - ctx->leftover);
          ctx->leftover = POLY1305_BLOCKSIZE;
        }
      burn_depth = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
    }

  /* Pack accumulator and final key into 64-bit limbs. */
  acc0 = state->h[0] + ((u64)state->h[1] << 32);
  acc1 = state->h[2] + ((u64)state->h[3] << 32);
  acc2 = state->h[4];

  pad0 = state->k[0] + ((u64)state->k[1] << 32);
  pad1 = state->k[2] + ((u64)state->k[3] << 32);

  /* Trial addition of 5: only the carries matter, the sums are discarded.
   * If h + 5 reaches 2^130 then h is not below 2^130-5.  */
  add_ssaaaa(cy, sel, 0, acc0, 0, 5);
  add_ssaaaa(cy, sel, 0, cy, 0, acc1);
  sel = (cy + acc2) >> 2; /* sel == 0 or 1 */

  /* Subtracting 2^130-5 is the same as adding 5 and dropping bit 130;
   * (-sel) & 5 yields 5 or 0 without branching.  */
  sel = (-sel) & 5;
  add_ssaaaa(acc1, acc0, acc1, acc0, 0, sel);

  /* add high part of key + h (mod 2^128) */
  add_ssaaaa(acc1, acc0, acc1, acc0, pad1, pad0);
  buf_put_le64(mac + 0, acc0);
  buf_put_le64(mac + 8, acc1);

  /* burn_stack */
  return 4 * sizeof (void *) + 7 * sizeof (u64) + burn_depth;
}
287 
288 #endif /* USE_MPI_64BIT */
289 
290 #ifdef USE_MPI_32BIT
291 
#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS

/* HI:LO += A * B (arm).  Single multiply-accumulate-long instruction; HI
 * and LO are both read and written (tied to the outputs by the matching
 * constraints "0" and "1").  */
#define UMUL_ADD_32(HI, LO, A, B) \
      __asm__ ("umlal %[acc_lo], %[acc_hi], %[fac_a], %[fac_b]" \
               : [acc_hi] "=r" (HI), [acc_lo] "=r" (LO) \
               : "0" (HI), "1" (LO), [fac_a] "r" (A), [fac_b] "r" (B) )

/* A += B (arm).  Five-limb addition with the carry rippling upwards.  */
#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
      __asm__ ("adds %[dst0], %[dst0], %[src0]\n" \
               "adcs %[dst1], %[dst1], %[src1]\n" \
               "adcs %[dst2], %[dst2], %[src2]\n" \
               "adcs %[dst3], %[dst3], %[src3]\n" \
               "adc %[dst4], %[dst4], %[src4]\n" \
               : [dst0] "+r" (A0), [dst1] "+r" (A1), [dst2] "+r" (A2), \
                 [dst3] "+r" (A3), [dst4] "+r" (A4) \
               : [src0] "r" (B0), [src1] "r" (B1), [src2] "r" (B2), \
                 [src3] "r" (B3), [src4] "r" (B4) \
               : "cc" )

#endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */

#if defined (__i386__) && __GNUC__ >= 4

/* A += B (i386).  Five-limb addition with the carry rippling upwards; the
 * B operands may be registers, memory or immediates.  */
#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
      __asm__ ("addl %[src0], %[dst0]\n" \
               "adcl %[src1], %[dst1]\n" \
               "adcl %[src2], %[dst2]\n" \
               "adcl %[src3], %[dst3]\n" \
               "adcl %[src4], %[dst4]\n" \
               : [dst0] "+r" (A0), [dst1] "+r" (A1), [dst2] "+r" (A2), \
                 [dst3] "+r" (A3), [dst4] "+r" (A4) \
               : [src0] "g" (B0), [src1] "g" (B1), [src2] "g" (B2), \
                 [src3] "g" (B3), [src4] "g" (B4) \
               : "cc" )

#endif /* __i386__ */

#ifndef UMUL_ADD_32
/* HI:LO += A * B (generic, mpi).  Used when no assembly variant was
 * selected above: full 32x32->64 multiply followed by a two-word add.  */
#  define UMUL_ADD_32(HI, LO, A, B) do { \
    u32 umul_pp_lo, umul_pp_hi; \
    umul_ppmm(umul_pp_hi, umul_pp_lo, A, B); \
    add_ssaaaa(HI, LO, HI, LO, umul_pp_hi, umul_pp_lo); \
  } while (0)
#endif

#ifndef ADD_1305_32
/* A += B (generic, mpi).  Used when no assembly variant was selected
 * above.  The three low limbs are added one at a time, each collecting its
 * own carry-out and absorbing the carry from the limb below; the top two
 * limbs are then added as a pair and the last carry is folded in.
 * Arguments are evaluated more than once; pass plain variables/constants. */
#  define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
    u32 add32_cy0, add32_cy1, add32_cy2; \
    add_ssaaaa(add32_cy0, A0, 0, A0, 0, B0); \
    add_ssaaaa(add32_cy1, A1, 0, A1, 0, B1); \
    add_ssaaaa(add32_cy1, A1, add32_cy1, A1, 0, add32_cy0); \
    add_ssaaaa(add32_cy2, A2, 0, A2, 0, B2); \
    add_ssaaaa(add32_cy2, A2, add32_cy2, A2, 0, add32_cy1); \
    add_ssaaaa(A4, A3, A4, A3, B4, B3); \
    add_ssaaaa(A4, A3, A4, A3, 0, add32_cy2); \
  } while (0)
#endif
350 
/* H = H * R (partially reduced mod 2^130-5), 32-bit limb version.
 *
 * H4..H0 is the accumulator, R3..R0 the multiplier, and R?_MULT5 the values
 * the caller precomputes from R1..R3 for partial products whose weight
 * wraps past 2^130.  Four 64-bit column sums col0..col3 (weights 2^0, 2^32,
 * 2^64, 2^96) are accumulated; their high halves are moved into the next
 * limb and the bits at and above 2^130 are folded back in multiplied by 5.
 *
 * The statement order matters: each H limb is overwritten only after its
 * last use as a multiplicand.  H4 is multiplied with plain one-word
 * multiplies, which relies on H4 staying small.  Arguments are evaluated
 * several times, so pass plain lvalues only.  */
#define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \
                        R3_MULT5, R2_MULT5, R1_MULT5) do { \
    u32 col0_lo, col0_hi, col1_lo, col1_hi; \
    u32 col2_lo, col2_hi, col3_lo, col3_hi; \
    u32 top_lo, top_hi; \
    \
    /* Row h0: starts every column. */ \
    umul_ppmm(col0_hi, col0_lo, H0, R0); \
    umul_ppmm(col1_hi, col1_lo, H0, R1); \
    umul_ppmm(col2_hi, col2_lo, H0, R2); \
    umul_ppmm(col3_hi, col3_lo, H0, R3); \
    \
    /* Row h1: h1*r3 wraps into column 0. */ \
    UMUL_ADD_32(col0_hi, col0_lo, H1, R3_MULT5); \
    UMUL_ADD_32(col1_hi, col1_lo, H1, R0); \
    UMUL_ADD_32(col2_hi, col2_lo, H1, R1); \
    UMUL_ADD_32(col3_hi, col3_lo, H1, R2); \
    \
    /* Row h2: h2*r2 and h2*r3 wrap into columns 0 and 1. */ \
    UMUL_ADD_32(col0_hi, col0_lo, H2, R2_MULT5); \
    UMUL_ADD_32(col1_hi, col1_lo, H2, R3_MULT5); \
    UMUL_ADD_32(col2_hi, col2_lo, H2, R0); \
    UMUL_ADD_32(col3_hi, col3_lo, H2, R1); \
    \
    /* Row h3: only h3*r0 does not wrap.  Column 0 is complete after its */ \
    /* first line, so its high half can be parked in H1 right away.      */ \
    UMUL_ADD_32(col0_hi, col0_lo, H3, R1_MULT5); \
    H1 = col0_hi; \
    UMUL_ADD_32(col1_hi, col1_lo, H3, R2_MULT5); \
    UMUL_ADD_32(col2_hi, col2_lo, H3, R3_MULT5); \
    UMUL_ADD_32(col3_hi, col3_lo, H3, R0); \
    \
    /* Row h4 (one-word products); the high halves of columns 1..3 land */ \
    /* in H2, H3 and H4.                                                */ \
    top_lo = H4 * R1_MULT5; \
    top_hi = H4 * R2_MULT5; \
    add_ssaaaa(H2, col1_lo, col1_hi, col1_lo, 0, top_lo); \
    add_ssaaaa(H3, col2_lo, col2_hi, col2_lo, 0, top_hi); \
    top_lo = H4 * R3_MULT5; \
    top_hi = H4 * R0; \
    add_ssaaaa(H4, col3_lo, col3_hi, col3_lo, top_hi, top_lo); \
    \
    /* Fold bits >= 130 back in times 5, keep bits 128..129 in H4, then */ \
    /* add the low halves of the columns.                               */ \
    H0 = (H4 >> 2) * 5; \
    H4 = H4 & 3; \
    ADD_1305_32(H4, H3, H2, H1, H0, 0, col3_lo, col2_lo, col1_lo, col0_lo); \
  } while (0)
392 
393 #ifndef HAVE_ASM_POLY1305_BLOCKS
394 
395 static unsigned int
poly1305_blocks(poly1305_context_t * ctx,const byte * buf,size_t len,byte high_pad)396 poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
397 		 byte high_pad)
398 {
399   POLY1305_STATE *st = &ctx->state;
400   u32 r1_mult5, r2_mult5, r3_mult5;
401   u32 h0, h1, h2, h3, h4;
402   u32 m0, m1, m2, m3, m4;
403 
404   m4 = high_pad;
405 
406   h0 = st->h[0];
407   h1 = st->h[1];
408   h2 = st->h[2];
409   h3 = st->h[3];
410   h4 = st->h[4];
411 
412   r1_mult5 = (st->r[1] >> 2) + st->r[1];
413   r2_mult5 = (st->r[2] >> 2) + st->r[2];
414   r3_mult5 = (st->r[3] >> 2) + st->r[3];
415 
416   while (len >= POLY1305_BLOCKSIZE)
417     {
418       m0 = buf_get_le32(buf + 0);
419       m1 = buf_get_le32(buf + 4);
420       m2 = buf_get_le32(buf + 8);
421       m3 = buf_get_le32(buf + 12);
422 
423       /* a = h + m */
424       ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0);
425 
426       /* h = a * r (partial mod 2^130-5) */
427       MUL_MOD_1305_32(h4, h3, h2, h1, h0,
428 		      st->r[3], st->r[2], st->r[1], st->r[0],
429 		      r3_mult5, r2_mult5, r1_mult5);
430 
431       buf += POLY1305_BLOCKSIZE;
432       len -= POLY1305_BLOCKSIZE;
433     }
434 
435   st->h[0] = h0;
436   st->h[1] = h1;
437   st->h[2] = h2;
438   st->h[3] = h3;
439   st->h[4] = h4;
440 
441   return 6 * sizeof (void *) + 28 * sizeof (u32);
442 }
443 
444 #endif /* !HAVE_ASM_POLY1305_BLOCKS */
445 
/* Finish the MAC computation (32-bit limb version) and write the 16-byte
 * tag to MAC.
 *
 * Any buffered partial block is terminated with a single 1 byte, zero
 * filled to a whole block and absorbed with high_pad == 0.  The accumulator
 * is then conditionally reduced, the key words k are added and the low 128
 * bits are stored little-endian.
 *
 * Returns the number of stack bytes the caller should burn.  */
static unsigned int poly1305_final (poly1305_context_t *ctx,
                                    byte mac[POLY1305_TAGLEN])
{
  POLY1305_STATE *state = &ctx->state;
  unsigned int burn_depth = 0;
  u32 acc0, acc1, acc2, acc3, acc4;
  u32 cy, w0, w1, w2, sel;

  /* process the remaining block */
  if (ctx->leftover)
    {
      ctx->buffer[ctx->leftover++] = 1;
      if (ctx->leftover < POLY1305_BLOCKSIZE)
        {
          memset (&ctx->buffer[ctx->leftover], 0,
                  POLY1305_BLOCKSIZE - ctx->leftover);
          ctx->leftover = POLY1305_BLOCKSIZE;
        }
      burn_depth = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
    }

  acc0 = state->h[0];
  acc1 = state->h[1];
  acc2 = state->h[2];
  acc3 = state->h[3];
  acc4 = state->h[4];

  /* Trial addition of 5: only the carries matter, the sums (w0) are
   * discarded.  If h + 5 reaches 2^130 then h is not below 2^130-5.  */
  add_ssaaaa(cy, w0, 0, acc0, 0, 5);
  add_ssaaaa(cy, w0, 0, cy, 0, acc1);
  add_ssaaaa(cy, w0, 0, cy, 0, acc2);
  add_ssaaaa(cy, w0, 0, cy, 0, acc3);
  sel = (cy + acc4) >> 2; /* sel == 0 or 1 */

  /* Subtracting 2^130-5 is the same as adding 5 and dropping bit 130;
   * (-sel) & 5 yields 5 or 0 without branching.  */
  sel = (-sel) & 5;
  add_ssaaaa(cy, acc0, 0, acc0, 0, sel);
  add_ssaaaa(cy, acc1, 0, acc1, 0, cy);
  add_ssaaaa(cy, acc2, 0, acc2, 0, cy);
  add_ssaaaa(cy, acc3, 0, acc3, 0, cy);

  /* add high part of key + h (mod 2^128); the carry out of the top word
   * is computed but intentionally unused.  */
  add_ssaaaa(w0, acc0, 0, acc0, 0, state->k[0]);
  add_ssaaaa(w1, acc1, 0, acc1, 0, state->k[1]);
  add_ssaaaa(w1, acc1, w1, acc1, 0, w0);
  add_ssaaaa(w2, acc2, 0, acc2, 0, state->k[2]);
  add_ssaaaa(w2, acc2, w2, acc2, 0, w1);
  add_ssaaaa(cy, acc3, 0, acc3, 0, state->k[3]);
  acc3 += w2;

  buf_put_le32(mac + 0, acc0);
  buf_put_le32(mac + 4, acc1);
  buf_put_le32(mac + 8, acc2);
  buf_put_le32(mac + 12, acc3);

  /* burn_stack */
  return 4 * sizeof (void *) + 10 * sizeof (u32) + burn_depth;
}
504 
505 #endif /* USE_MPI_32BIT */
506 
507 
508 unsigned int
_gcry_poly1305_update_burn(poly1305_context_t * ctx,const byte * m,size_t bytes)509 _gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m,
510 			    size_t bytes)
511 {
512   unsigned int burn = 0;
513 
514   /* handle leftover */
515   if (ctx->leftover)
516     {
517       size_t want = (POLY1305_BLOCKSIZE - ctx->leftover);
518       if (want > bytes)
519 	want = bytes;
520       buf_cpy (ctx->buffer + ctx->leftover, m, want);
521       bytes -= want;
522       m += want;
523       ctx->leftover += want;
524       if (ctx->leftover < POLY1305_BLOCKSIZE)
525 	return 0;
526       burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
527       ctx->leftover = 0;
528     }
529 
530   /* process full blocks */
531   if (bytes >= POLY1305_BLOCKSIZE)
532     {
533       size_t nblks = bytes / POLY1305_BLOCKSIZE;
534       burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
535       m += nblks * POLY1305_BLOCKSIZE;
536       bytes -= nblks * POLY1305_BLOCKSIZE;
537     }
538 
539   /* store leftover */
540   if (bytes)
541     {
542       buf_cpy (ctx->buffer + ctx->leftover, m, bytes);
543       ctx->leftover += bytes;
544     }
545 
546   return burn;
547 }
548 
549 
550 void
_gcry_poly1305_update(poly1305_context_t * ctx,const byte * m,size_t bytes)551 _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
552 {
553   unsigned int burn;
554 
555   burn = _gcry_poly1305_update_burn (ctx, m, bytes);
556 
557   if (burn)
558     _gcry_burn_stack (burn);
559 }
560 
561 
562 void
_gcry_poly1305_finish(poly1305_context_t * ctx,byte mac[POLY1305_TAGLEN])563 _gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN])
564 {
565   unsigned int burn;
566 
567   burn = poly1305_final (ctx, mac);
568 
569   _gcry_burn_stack (burn);
570 }
571 
572 
573 gcry_err_code_t
_gcry_poly1305_init(poly1305_context_t * ctx,const byte * key,size_t keylen)574 _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
575 		     size_t keylen)
576 {
577   static int initialized;
578   static const char *selftest_failed;
579 
580   if (!initialized)
581     {
582       initialized = 1;
583       selftest_failed = selftest ();
584       if (selftest_failed)
585 	log_error ("Poly1305 selftest failed (%s)\n", selftest_failed);
586     }
587 
588   if (keylen != POLY1305_KEYLEN)
589     return GPG_ERR_INV_KEYLEN;
590 
591   if (selftest_failed)
592     return GPG_ERR_SELFTEST_FAILED;
593 
594   poly1305_init (ctx, key);
595 
596   return 0;
597 }
598 
599 
600 static void
poly1305_auth(byte mac[POLY1305_TAGLEN],const byte * m,size_t bytes,const byte * key)601 poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes,
602 	       const byte * key)
603 {
604   poly1305_context_t ctx;
605 
606   memset (&ctx, 0, sizeof (ctx));
607 
608   _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN);
609   _gcry_poly1305_update (&ctx, m, bytes);
610   _gcry_poly1305_finish (&ctx, mac);
611 
612   wipememory (&ctx, sizeof (ctx));
613 }
614 
615 
616 static const char *
selftest(void)617 selftest (void)
618 {
619   /* example from nacl */
620   static const byte nacl_key[POLY1305_KEYLEN] = {
621     0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91,
622     0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25,
623     0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65,
624     0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80,
625   };
626 
627   static const byte nacl_msg[131] = {
628     0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73,
629     0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce,
630     0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4,
631     0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a,
632     0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b,
633     0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72,
634     0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2,
635     0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38,
636     0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a,
637     0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae,
638     0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea,
639     0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda,
640     0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde,
641     0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3,
642     0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6,
643     0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74,
644     0xe3, 0x55, 0xa5
645   };
646 
647   static const byte nacl_mac[16] = {
648     0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5,
649     0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9
650   };
651 
652   /* generates a final value of (2^130 - 2) == 3 */
653   static const byte wrap_key[POLY1305_KEYLEN] = {
654     0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
655     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
656     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
657     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
658   };
659 
660   static const byte wrap_msg[16] = {
661     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
662     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
663   };
664 
665   static const byte wrap_mac[16] = {
666     0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
667     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
668   };
669 
670   /* mac of the macs of messages of length 0 to 256, where the key and messages
671    * have all their values set to the length
672    */
673   static const byte total_key[POLY1305_KEYLEN] = {
674     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
675     0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9,
676     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
677     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
678   };
679 
680   static const byte total_mac[16] = {
681     0x64, 0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd,
682     0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39
683   };
684 
685   poly1305_context_t ctx;
686   poly1305_context_t total_ctx;
687   byte all_key[POLY1305_KEYLEN];
688   byte all_msg[256];
689   byte mac[16];
690   size_t i, j;
691 
692   memset (&ctx, 0, sizeof (ctx));
693   memset (&total_ctx, 0, sizeof (total_ctx));
694 
695   memset (mac, 0, sizeof (mac));
696   poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key);
697   if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
698     return "Poly1305 test 1 failed.";
699 
700   /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so
701    * make sure everything still works varying between them */
702   memset (mac, 0, sizeof (mac));
703   _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN);
704   _gcry_poly1305_update (&ctx, nacl_msg + 0, 32);
705   _gcry_poly1305_update (&ctx, nacl_msg + 32, 64);
706   _gcry_poly1305_update (&ctx, nacl_msg + 96, 16);
707   _gcry_poly1305_update (&ctx, nacl_msg + 112, 8);
708   _gcry_poly1305_update (&ctx, nacl_msg + 120, 4);
709   _gcry_poly1305_update (&ctx, nacl_msg + 124, 2);
710   _gcry_poly1305_update (&ctx, nacl_msg + 126, 1);
711   _gcry_poly1305_update (&ctx, nacl_msg + 127, 1);
712   _gcry_poly1305_update (&ctx, nacl_msg + 128, 1);
713   _gcry_poly1305_update (&ctx, nacl_msg + 129, 1);
714   _gcry_poly1305_update (&ctx, nacl_msg + 130, 1);
715   _gcry_poly1305_finish (&ctx, mac);
716   if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
717     return "Poly1305 test 2 failed.";
718 
719   memset (mac, 0, sizeof (mac));
720   poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key);
721   if (memcmp (wrap_mac, mac, sizeof (nacl_mac)) != 0)
722     return "Poly1305 test 3 failed.";
723 
724   _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN);
725   for (i = 0; i < 256; i++)
726     {
727       /* set key and message to 'i,i,i..' */
728       for (j = 0; j < sizeof (all_key); j++)
729 	all_key[j] = i;
730       for (j = 0; j < i; j++)
731 	all_msg[j] = i;
732       poly1305_auth (mac, all_msg, i, all_key);
733       _gcry_poly1305_update (&total_ctx, mac, 16);
734     }
735   _gcry_poly1305_finish (&total_ctx, mac);
736   if (memcmp (total_mac, mac, sizeof (total_mac)) != 0)
737     return "Poly1305 test 4 failed.";
738 
739   return NULL;
740 }
741