/*
 * Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <stdlib.h>
#include <string.h>
#include <openssl/crypto.h>

#include "crypto/poly1305.h"
#include "poly1305_local.h"

/* Size of the opaque per-stream Poly1305 state, for callers that allocate it. */
size_t Poly1305_ctx_size(void)
{
    return sizeof(struct poly1305_context);
}

/* pick 32-bit unsigned integer in little endian order */
static unsigned int U8TOU32(const unsigned char *p)
{
    return (((unsigned int)(p[0] & 0xff)) |
            ((unsigned int)(p[1] & 0xff) << 8) |
            ((unsigned int)(p[2] & 0xff) << 16) |
            ((unsigned int)(p[3] & 0xff) << 24));
}

/*
 * Implementations can be classified by amount of significant bits in
 * words making up the multi-precision value, or in other words radix
 * or base of numerical representation, e.g. base 2^64, base 2^32,
 * base 2^26. Complementary characteristic is how wide is the result of
 * multiplication of pair of digits, e.g. it would take 128 bits to
 * accommodate multiplication result in base 2^64 case. These are used
 * interchangeably. To describe implementation that is. But interface
 * is designed to isolate this so that low-level primitives implemented
 * in assembly can be self-contained/self-coherent.
 */
#ifndef POLY1305_ASM
/*
 * Even though there is __int128 reference implementation targeting
 * 64-bit platforms provided below, it's not obvious that it's optimal
 * choice for every one of them. Depending on instruction set overall
 * amount of instructions can be comparable to one in __int64
 * implementation. Amount of multiplication instructions would be lower,
 * but not necessarily overall. And in out-of-order execution context,
 * it is the latter that can be crucial...
 *
 * On related note. Poly1305 author, D. J. Bernstein, discusses and
 * provides floating-point implementations of the algorithm in question.
 * It made a lot of sense by the time of introduction, because most
 * then-modern processors didn't have pipelined integer multiplier.
 * [Not to mention that some had non-constant timing for integer
 * multiplications.] Floating-point instructions on the other hand could
 * be issued every cycle, which allowed to achieve better performance.
 * Nowadays, with SIMD and/or out-of-order execution, shared or
 * even emulated FPU, it's more complicated, and floating-point
 * implementation is not necessarily optimal choice in every situation,
 * rather contrary...
 *
 * <appro@openssl.org>
 */

typedef unsigned int u32;

/*
 * poly1305_blocks processes a multiple of POLY1305_BLOCK_SIZE blocks
 * of |inp| no longer than |len|. Behaviour for |len| not divisible by
 * block size is unspecified in general case, even though in reference
 * implementation the trailing chunk is simply ignored. Per algorithm
 * specification, every input block, complete or last partial, is to be
 * padded with a bit past most significant byte. The latter kind is then
 * padded with zeros till block size. This last partial block padding
 * is caller(*)'s responsibility, and because of this the last partial
 * block is always processed with separate call with |len| set to
 * POLY1305_BLOCK_SIZE and |padbit| to 0. In all other cases |padbit|
 * should be set to 1 to perform implicit padding with 128th bit.
 * poly1305_blocks does not actually check for this constraint though,
 * it's caller(*)'s responsibility to comply.
 *
 * (*) In the context "caller" is not application code, but higher
 *     level Poly1305_* from this very module, so that quirks are
 *     handled locally.
 */
static void
poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit);

/*
 * Type-agnostic "rip-off" from constant_time.h
 */
# define CONSTANT_TIME_CARRY(a,b) ( \
         (a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \
 )

# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \
     (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8)

typedef unsigned long u64;
typedef __uint128_t u128;

typedef struct {
    u64 h[3];
    u64 r[2];
} poly1305_internal;

/* pick 64-bit unsigned integer in little endian order */
static u64 U8TOU64(const unsigned char *p)
{
    return (((u64)(p[0] & 0xff)) |
            ((u64)(p[1] & 0xff) << 8) |
            ((u64)(p[2] & 0xff) << 16) |
            ((u64)(p[3] & 0xff) << 24) |
            ((u64)(p[4] & 0xff) << 32) |
            ((u64)(p[5] & 0xff) << 40) |
            ((u64)(p[6] & 0xff) << 48) |
            ((u64)(p[7] & 0xff) << 56));
}

/* store a 64-bit unsigned integer in little endian */
static void U64TO8(unsigned char *p, u64 v)
{
    p[0] = (unsigned char)((v) & 0xff);
    p[1] = (unsigned char)((v >> 8) & 0xff);
    p[2] = (unsigned char)((v >> 16) & 0xff);
    p[3] = (unsigned char)((v >> 24) & 0xff);
    p[4] = (unsigned char)((v >> 32) & 0xff);
    p[5] = (unsigned char)((v >> 40) & 0xff);
    p[6] = (unsigned char)((v >> 48) & 0xff);
    p[7] = (unsigned char)((v >> 56) & 0xff);
}

static void poly1305_init(void *ctx, const unsigned char key[16])
{
    poly1305_internal *st = (poly1305_internal *) ctx;

    /* h = 0 */
    st->h[0] = 0;
    st->h[1] = 0;
    st->h[2] = 0;

    /* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
    st->r[0] = U8TOU64(&key[0]) & 0x0ffffffc0fffffff;
    st->r[1] = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc;
}

static void
poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
{
    poly1305_internal *st = (poly1305_internal *)ctx;
    u64 r0, r1;
    u64 s1;
    u64 h0, h1, h2, c;
    u128 d0, d1;

    r0 = st->r[0];
    r1 = st->r[1];

    s1 = r1 + (r1 >> 2);

    h0 = st->h[0];
    h1 = st->h[1];
    h2 = st->h[2];

    while (len >= POLY1305_BLOCK_SIZE) {
        /* h += m[i] */
        h0 = (u64)(d0 = (u128)h0 + U8TOU64(inp + 0));
        h1 = (u64)(d1 = (u128)h1 + (d0 >> 64) + U8TOU64(inp + 8));
        /*
         * padbit can be zero only when original len was
         * POLY1305_BLOCK_SIZE, but we don't check
         */
        h2 += (u64)(d1 >> 64) + padbit;

        /* h *= r "%" p, where "%" stands for "partial remainder" */
        d0 = ((u128)h0 * r0) +
             ((u128)h1 * s1);
        d1 = ((u128)h0 * r1) +
             ((u128)h1 * r0) +
             (h2 * s1);
        h2 = (h2 * r0);

        /* last reduction step: */
        /* a) h2:h0 = h2<<128 + d1<<64 + d0 */
        h0 = (u64)d0;
        h1 = (u64)(d1 += d0 >> 64);
        h2 += (u64)(d1 >> 64);
        /* b) (h2:h0 += (h2:h0>>130) * 5) %= 2^130 */
        c = (h2 >> 2) + (h2 & ~3UL);
        h2 &= 3;
        h0 += c;
        h1 += (c = CONSTANT_TIME_CARRY(h0,c));
        h2 += CONSTANT_TIME_CARRY(h1,c);
        /*
         * Occasional overflows to 3rd bit of h2 are taken care of
         * "naturally". If after this point we end up at the top of
         * this loop, then the overflow bit will be accounted for
         * in next iteration. If we end up in poly1305_emit, then
         * comparison to modulus below will still count as "carry
         * into 131st bit", so that properly reduced value will be
         * picked in conditional move.
         */

        inp += POLY1305_BLOCK_SIZE;
        len -= POLY1305_BLOCK_SIZE;
    }

    st->h[0] = h0;
    st->h[1] = h1;
    st->h[2] = h2;
}

static void poly1305_emit(void *ctx, unsigned char mac[16],
                          const u32 nonce[4])
{
    poly1305_internal *st = (poly1305_internal *) ctx;
    u64 h0, h1, h2;
    u64 g0, g1, g2;
    u128 t;
    u64 mask;

    h0 = st->h[0];
    h1 = st->h[1];
    h2 = st->h[2];

    /* compare to modulus by computing h + -p */
    g0 = (u64)(t = (u128)h0 + 5);
    g1 = (u64)(t = (u128)h1 + (t >> 64));
    g2 = h2 + (u64)(t >> 64);

    /* if there was carry into 131st bit, h1:h0 = g1:g0 */
    mask = 0 - (g2 >> 2);
    g0 &= mask;
    g1 &= mask;
    mask = ~mask;
    h0 = (h0 & mask) | g0;
    h1 = (h1 & mask) | g1;

    /* mac = (h + nonce) % (2^128) */
    h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32));
    h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64));

    U64TO8(mac + 0, h0);
    U64TO8(mac + 8, h1);
}

# else

#  if defined(_WIN32) && !defined(__MINGW32__)
typedef unsigned __int64 u64;
#  elif defined(__arch64__)
typedef unsigned long u64;
#  else
typedef unsigned long long u64;
#  endif

typedef struct {
    u32 h[5];
    u32 r[4];
} poly1305_internal;

/* store a 32-bit unsigned integer in little endian */
static void U32TO8(unsigned char *p, unsigned int v)
{
    p[0] = (unsigned char)((v) & 0xff);
    p[1] = (unsigned char)((v >> 8) & 0xff);
    p[2] = (unsigned char)((v >> 16) & 0xff);
    p[3] = (unsigned char)((v >> 24) & 0xff);
}

static void poly1305_init(void *ctx, const unsigned char key[16])
{
    poly1305_internal *st = (poly1305_internal *) ctx;

    /* h = 0 */
    st->h[0] = 0;
    st->h[1] = 0;
    st->h[2] = 0;
    st->h[3] = 0;
    st->h[4] = 0;

    /* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
    st->r[0] = U8TOU32(&key[0]) & 0x0fffffff;
    st->r[1] = U8TOU32(&key[4]) & 0x0ffffffc;
    st->r[2] = U8TOU32(&key[8]) & 0x0ffffffc;
    st->r[3] = U8TOU32(&key[12]) & 0x0ffffffc;
}

static void
poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
{
    poly1305_internal *st = (poly1305_internal *)ctx;
    u32 r0, r1, r2, r3;
    u32 s1, s2, s3;
    u32 h0, h1, h2, h3, h4, c;
    u64 d0, d1, d2, d3;

    r0 = st->r[0];
    r1 = st->r[1];
    r2 = st->r[2];
    r3 = st->r[3];

    s1 = r1 + (r1 >> 2);
    s2 = r2 + (r2 >> 2);
    s3 = r3 + (r3 >> 2);

    h0 = st->h[0];
    h1 = st->h[1];
    h2 = st->h[2];
    h3 = st->h[3];
    h4 = st->h[4];

    while (len >= POLY1305_BLOCK_SIZE) {
        /* h += m[i] */
        h0 = (u32)(d0 = (u64)h0 + U8TOU32(inp + 0));
        h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + U8TOU32(inp + 4));
        h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + U8TOU32(inp + 8));
        h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + U8TOU32(inp + 12));
        h4 += (u32)(d3 >> 32) + padbit;

        /* h *= r "%" p, where "%" stands for "partial remainder" */
        d0 = ((u64)h0 * r0) +
             ((u64)h1 * s3) +
             ((u64)h2 * s2) +
             ((u64)h3 * s1);
        d1 = ((u64)h0 * r1) +
             ((u64)h1 * r0) +
             ((u64)h2 * s3) +
             ((u64)h3 * s2) +
             (h4 * s1);
        d2 = ((u64)h0 * r2) +
             ((u64)h1 * r1) +
             ((u64)h2 * r0) +
             ((u64)h3 * s3) +
             (h4 * s2);
        d3 = ((u64)h0 * r3) +
             ((u64)h1 * r2) +
             ((u64)h2 * r1) +
             ((u64)h3 * r0) +
             (h4 * s3);
        h4 = (h4 * r0);

        /* last reduction step: */
        /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */
        h0 = (u32)d0;
        h1 = (u32)(d1 += d0 >> 32);
        h2 = (u32)(d2 += d1 >> 32);
        h3 = (u32)(d3 += d2 >> 32);
        h4 += (u32)(d3 >> 32);
        /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */
        c = (h4 >> 2) + (h4 & ~3U);
        h4 &= 3;
        h0 += c;
        h1 += (c = CONSTANT_TIME_CARRY(h0,c));
        h2 += (c = CONSTANT_TIME_CARRY(h1,c));
        h3 += (c = CONSTANT_TIME_CARRY(h2,c));
        h4 += CONSTANT_TIME_CARRY(h3,c);
        /*
         * Occasional overflows to 3rd bit of h4 are taken care of
         * "naturally". If after this point we end up at the top of
         * this loop, then the overflow bit will be accounted for
         * in next iteration. If we end up in poly1305_emit, then
         * comparison to modulus below will still count as "carry
         * into 131st bit", so that properly reduced value will be
         * picked in conditional move.
         */

        inp += POLY1305_BLOCK_SIZE;
        len -= POLY1305_BLOCK_SIZE;
    }

    st->h[0] = h0;
    st->h[1] = h1;
    st->h[2] = h2;
    st->h[3] = h3;
    st->h[4] = h4;
}

static void poly1305_emit(void *ctx, unsigned char mac[16],
                          const u32 nonce[4])
{
    poly1305_internal *st = (poly1305_internal *) ctx;
    u32 h0, h1, h2, h3, h4;
    u32 g0, g1, g2, g3, g4;
    u64 t;
    u32 mask;

    h0 = st->h[0];
    h1 = st->h[1];
    h2 = st->h[2];
    h3 = st->h[3];
    h4 = st->h[4];

    /* compare to modulus by computing h + -p */
    g0 = (u32)(t = (u64)h0 + 5);
    g1 = (u32)(t = (u64)h1 + (t >> 32));
    g2 = (u32)(t = (u64)h2 + (t >> 32));
    g3 = (u32)(t = (u64)h3 + (t >> 32));
    g4 = h4 + (u32)(t >> 32);

    /* if there was carry into 131st bit, h3:h0 = g3:g0 */
    mask = 0 - (g4 >> 2);
    g0 &= mask;
    g1 &= mask;
    g2 &= mask;
    g3 &= mask;
    mask = ~mask;
    h0 = (h0 & mask) | g0;
    h1 = (h1 & mask) | g1;
    h2 = (h2 & mask) | g2;
    h3 = (h3 & mask) | g3;

    /* mac = (h + nonce) % (2^128) */
    h0 = (u32)(t = (u64)h0 + nonce[0]);
    h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]);
    h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]);
    h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]);

    U32TO8(mac + 0, h0);
    U32TO8(mac + 4, h1);
    U32TO8(mac + 8, h2);
    U32TO8(mac + 12, h3);
}
# endif
#else
int poly1305_init(void *ctx, const unsigned char key[16], void *func);
void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
                     unsigned int padbit);
void poly1305_emit(void *ctx, unsigned char mac[16],
                   const unsigned int nonce[4]);
#endif

void Poly1305_Init(POLY1305 *ctx, const unsigned char key[32])
{
    /* second half of the 256-bit key is the 128-bit nonce (s) */
    ctx->nonce[0] = U8TOU32(&key[16]);
    ctx->nonce[1] = U8TOU32(&key[20]);
    ctx->nonce[2] = U8TOU32(&key[24]);
    ctx->nonce[3] = U8TOU32(&key[28]);

#ifndef POLY1305_ASM
    poly1305_init(ctx->opaque, key);
#else
    /*
     * Unlike reference poly1305_init assembly counterpart is expected
     * to return a value: non-zero if it initializes ctx->func, and zero
     * otherwise. Latter is to simplify assembly in cases when there are
     * no multiple code paths to switch between.
     */
    if (!poly1305_init(ctx->opaque, key, &ctx->func)) {
        ctx->func.blocks = poly1305_blocks;
        ctx->func.emit = poly1305_emit;
    }
#endif

    ctx->num = 0;

}

#ifdef POLY1305_ASM
/*
 * This "eclipses" poly1305_blocks and poly1305_emit, but it's
 * conscious choice imposed by -Wshadow compiler warnings.
 */
# define poly1305_blocks (*poly1305_blocks_p)
# define poly1305_emit   (*poly1305_emit_p)
#endif

void Poly1305_Update(POLY1305 *ctx, const unsigned char *inp, size_t len)
{
#ifdef POLY1305_ASM
    /*
     * As documented, poly1305_blocks is never called with input
     * longer than single block and padbit argument set to 0. This
     * property is fluently used in assembly modules to optimize
     * padbit handling on loop boundary.
     */
    poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks;
#endif
    size_t rem, num;

    /* first, top up any partial block buffered from a previous call */
    if ((num = ctx->num)) {
        rem = POLY1305_BLOCK_SIZE - num;
        if (len >= rem) {
            memcpy(ctx->data + num, inp, rem);
            poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1);
            inp += rem;
            len -= rem;
        } else {
            /* Still not enough data to process a block. */
            memcpy(ctx->data + num, inp, len);
            ctx->num = num + len;
            return;
        }
    }

    /* then process as many whole blocks as possible directly from |inp| */
    rem = len % POLY1305_BLOCK_SIZE;
    len -= rem;

    if (len >= POLY1305_BLOCK_SIZE) {
        poly1305_blocks(ctx->opaque, inp, len, 1);
        inp += len;
    }

    /* finally, buffer any trailing partial block for the next call */
    if (rem)
        memcpy(ctx->data, inp, rem);

    ctx->num = rem;
}

void Poly1305_Final(POLY1305 *ctx, unsigned char mac[16])
{
#ifdef POLY1305_ASM
    poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks;
    poly1305_emit_f poly1305_emit_p = ctx->func.emit;
#endif
    size_t num;

    /* explicitly pad the last partial block, then process it with padbit 0 */
    if ((num = ctx->num)) {
        ctx->data[num++] = 1;   /* pad bit */
        while (num < POLY1305_BLOCK_SIZE)
            ctx->data[num++] = 0;
        poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0);
    }

    poly1305_emit(ctx->opaque, mac, ctx->nonce);

    /* zero out the state */
    OPENSSL_cleanse(ctx, sizeof(*ctx));
}