1 /* 2 * Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved. 3 * 4 * Licensed under the Apache License 2.0 (the "License"). You may not use 5 * this file except in compliance with the License. You can obtain a copy 6 * in the file LICENSE in the source distribution or at 7 * https://www.openssl.org/source/license.html 8 */ 9 10 #include <stdlib.h> 11 #include <string.h> 12 #include <openssl/crypto.h> 13 14 #include "crypto/poly1305.h" 15 16 size_t Poly1305_ctx_size(void) 17 { 18 return sizeof(struct poly1305_context); 19 } 20 21 /* pick 32-bit unsigned integer in little endian order */ 22 static unsigned int U8TOU32(const unsigned char *p) 23 { 24 return (((unsigned int)(p[0] & 0xff)) | 25 ((unsigned int)(p[1] & 0xff) << 8) | 26 ((unsigned int)(p[2] & 0xff) << 16) | 27 ((unsigned int)(p[3] & 0xff) << 24)); 28 } 29 30 /* 31 * Implementations can be classified by amount of significant bits in 32 * words making up the multi-precision value, or in other words radix 33 * or base of numerical representation, e.g. base 2^64, base 2^32, 34 * base 2^26. Complementary characteristic is how wide is the result of 35 * multiplication of pair of digits, e.g. it would take 128 bits to 36 * accommodate multiplication result in base 2^64 case. These are used 37 * interchangeably. To describe implementation that is. But interface 38 * is designed to isolate this so that low-level primitives implemented 39 * in assembly can be self-contained/self-coherent. 40 */ 41 #ifndef POLY1305_ASM 42 /* 43 * Even though there is __int128 reference implementation targeting 44 * 64-bit platforms provided below, it's not obvious that it's optimal 45 * choice for every one of them. Depending on instruction set overall 46 * amount of instructions can be comparable to one in __int64 47 * implementation. Amount of multiplication instructions would be lower, 48 * but not necessarily overall. And in out-of-order execution context, 49 * it is the latter that can be crucial... 50 * 51 * On related note. Poly1305 author, D. J. Bernstein, discusses and 52 * provides floating-point implementations of the algorithm in question. 53 * It made a lot of sense by the time of introduction, because most 54 * then-modern processors didn't have pipelined integer multiplier. 55 * [Not to mention that some had non-constant timing for integer 56 * multiplications.] Floating-point instructions on the other hand could 57 * be issued every cycle, which allowed to achieve better performance. 58 * Nowadays, with SIMD and/or out-or-order execution, shared or 59 * even emulated FPU, it's more complicated, and floating-point 60 * implementation is not necessarily optimal choice in every situation, 61 * rather contrary... 62 * 63 * <appro@openssl.org> 64 */ 65 66 typedef unsigned int u32; 67 68 /* 69 * poly1305_blocks processes a multiple of POLY1305_BLOCK_SIZE blocks 70 * of |inp| no longer than |len|. Behaviour for |len| not divisible by 71 * block size is unspecified in general case, even though in reference 72 * implementation the trailing chunk is simply ignored. Per algorithm 73 * specification, every input block, complete or last partial, is to be 74 * padded with a bit past most significant byte. The latter kind is then 75 * padded with zeros till block size. This last partial block padding 76 * is caller(*)'s responsibility, and because of this the last partial 77 * block is always processed with separate call with |len| set to 78 * POLY1305_BLOCK_SIZE and |padbit| to 0. In all other cases |padbit| 79 * should be set to 1 to perform implicit padding with 128th bit. 80 * poly1305_blocks does not actually check for this constraint though, 81 * it's caller(*)'s responsibility to comply. 82 * 83 * (*) In the context "caller" is not application code, but higher 84 * level Poly1305_* from this very module, so that quirks are 85 * handled locally. 86 */ 87 static void 88 poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit); 89 90 /* 91 * Type-agnostic "rip-off" from constant_time.h 92 */ 93 # define CONSTANT_TIME_CARRY(a,b) ( \ 94 (a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \ 95 ) 96 97 # if defined(INT64_MAX) && defined(INT128_MAX) 98 99 typedef unsigned long u64; 100 typedef uint128_t u128; 101 102 typedef struct { 103 u64 h[3]; 104 u64 r[2]; 105 } poly1305_internal; 106 107 /* pick 32-bit unsigned integer in little endian order */ 108 static u64 U8TOU64(const unsigned char *p) 109 { 110 return (((u64)(p[0] & 0xff)) | 111 ((u64)(p[1] & 0xff) << 8) | 112 ((u64)(p[2] & 0xff) << 16) | 113 ((u64)(p[3] & 0xff) << 24) | 114 ((u64)(p[4] & 0xff) << 32) | 115 ((u64)(p[5] & 0xff) << 40) | 116 ((u64)(p[6] & 0xff) << 48) | 117 ((u64)(p[7] & 0xff) << 56)); 118 } 119 120 /* store a 32-bit unsigned integer in little endian */ 121 static void U64TO8(unsigned char *p, u64 v) 122 { 123 p[0] = (unsigned char)((v) & 0xff); 124 p[1] = (unsigned char)((v >> 8) & 0xff); 125 p[2] = (unsigned char)((v >> 16) & 0xff); 126 p[3] = (unsigned char)((v >> 24) & 0xff); 127 p[4] = (unsigned char)((v >> 32) & 0xff); 128 p[5] = (unsigned char)((v >> 40) & 0xff); 129 p[6] = (unsigned char)((v >> 48) & 0xff); 130 p[7] = (unsigned char)((v >> 56) & 0xff); 131 } 132 133 static void poly1305_init(void *ctx, const unsigned char key[16]) 134 { 135 poly1305_internal *st = (poly1305_internal *) ctx; 136 137 /* h = 0 */ 138 st->h[0] = 0; 139 st->h[1] = 0; 140 st->h[2] = 0; 141 142 /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 143 st->r[0] = U8TOU64(&key[0]) & 0x0ffffffc0fffffff; 144 st->r[1] = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc; 145 } 146 147 static void 148 poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) 149 { 150 poly1305_internal *st = (poly1305_internal *)ctx; 151 u64 r0, r1; 152 u64 s1; 153 u64 h0, h1, h2, c; 154 u128 d0, d1; 155 156 r0 = st->r[0]; 157 r1 = st->r[1]; 158 159 s1 = r1 + (r1 >> 2); 160 161 h0 = st->h[0]; 162 h1 = st->h[1]; 163 h2 = st->h[2]; 164 165 while (len >= POLY1305_BLOCK_SIZE) { 166 /* h += m[i] */ 167 h0 = (u64)(d0 = (u128)h0 + U8TOU64(inp + 0)); 168 h1 = (u64)(d1 = (u128)h1 + (d0 >> 64) + U8TOU64(inp + 8)); 169 /* 170 * padbit can be zero only when original len was 171 * POLY1306_BLOCK_SIZE, but we don't check 172 */ 173 h2 += (u64)(d1 >> 64) + padbit; 174 175 /* h *= r "%" p, where "%" stands for "partial remainder" */ 176 d0 = ((u128)h0 * r0) + 177 ((u128)h1 * s1); 178 d1 = ((u128)h0 * r1) + 179 ((u128)h1 * r0) + 180 (h2 * s1); 181 h2 = (h2 * r0); 182 183 /* last reduction step: */ 184 /* a) h2:h0 = h2<<128 + d1<<64 + d0 */ 185 h0 = (u64)d0; 186 h1 = (u64)(d1 += d0 >> 64); 187 h2 += (u64)(d1 >> 64); 188 /* b) (h2:h0 += (h2:h0>>130) * 5) %= 2^130 */ 189 c = (h2 >> 2) + (h2 & ~3UL); 190 h2 &= 3; 191 h0 += c; 192 h1 += (c = CONSTANT_TIME_CARRY(h0,c)); 193 h2 += CONSTANT_TIME_CARRY(h1,c); 194 /* 195 * Occasional overflows to 3rd bit of h2 are taken care of 196 * "naturally". If after this point we end up at the top of 197 * this loop, then the overflow bit will be accounted for 198 * in next iteration. If we end up in poly1305_emit, then 199 * comparison to modulus below will still count as "carry 200 * into 131st bit", so that properly reduced value will be 201 * picked in conditional move. 202 */ 203 204 inp += POLY1305_BLOCK_SIZE; 205 len -= POLY1305_BLOCK_SIZE; 206 } 207 208 st->h[0] = h0; 209 st->h[1] = h1; 210 st->h[2] = h2; 211 } 212 213 static void poly1305_emit(void *ctx, unsigned char mac[16], 214 const u32 nonce[4]) 215 { 216 poly1305_internal *st = (poly1305_internal *) ctx; 217 u64 h0, h1, h2; 218 u64 g0, g1, g2; 219 u128 t; 220 u64 mask; 221 222 h0 = st->h[0]; 223 h1 = st->h[1]; 224 h2 = st->h[2]; 225 226 /* compare to modulus by computing h + -p */ 227 g0 = (u64)(t = (u128)h0 + 5); 228 g1 = (u64)(t = (u128)h1 + (t >> 64)); 229 g2 = h2 + (u64)(t >> 64); 230 231 /* if there was carry into 131st bit, h1:h0 = g1:g0 */ 232 mask = 0 - (g2 >> 2); 233 g0 &= mask; 234 g1 &= mask; 235 mask = ~mask; 236 h0 = (h0 & mask) | g0; 237 h1 = (h1 & mask) | g1; 238 239 /* mac = (h + nonce) % (2^128) */ 240 h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32)); 241 h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64)); 242 243 U64TO8(mac + 0, h0); 244 U64TO8(mac + 8, h1); 245 } 246 247 # else 248 249 # if defined(_WIN32) && !defined(__MINGW32__) 250 typedef unsigned __int64 u64; 251 # elif defined(__arch64__) 252 typedef unsigned long u64; 253 # else 254 typedef unsigned long long u64; 255 # endif 256 257 typedef struct { 258 u32 h[5]; 259 u32 r[4]; 260 } poly1305_internal; 261 262 /* store a 32-bit unsigned integer in little endian */ 263 static void U32TO8(unsigned char *p, unsigned int v) 264 { 265 p[0] = (unsigned char)((v) & 0xff); 266 p[1] = (unsigned char)((v >> 8) & 0xff); 267 p[2] = (unsigned char)((v >> 16) & 0xff); 268 p[3] = (unsigned char)((v >> 24) & 0xff); 269 } 270 271 static void poly1305_init(void *ctx, const unsigned char key[16]) 272 { 273 poly1305_internal *st = (poly1305_internal *) ctx; 274 275 /* h = 0 */ 276 st->h[0] = 0; 277 st->h[1] = 0; 278 st->h[2] = 0; 279 st->h[3] = 0; 280 st->h[4] = 0; 281 282 /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 283 st->r[0] = U8TOU32(&key[0]) & 0x0fffffff; 284 st->r[1] = U8TOU32(&key[4]) & 0x0ffffffc; 285 st->r[2] = U8TOU32(&key[8]) & 0x0ffffffc; 286 st->r[3] = U8TOU32(&key[12]) & 0x0ffffffc; 287 } 288 289 static void 290 poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit) 291 { 292 poly1305_internal *st = (poly1305_internal *)ctx; 293 u32 r0, r1, r2, r3; 294 u32 s1, s2, s3; 295 u32 h0, h1, h2, h3, h4, c; 296 u64 d0, d1, d2, d3; 297 298 r0 = st->r[0]; 299 r1 = st->r[1]; 300 r2 = st->r[2]; 301 r3 = st->r[3]; 302 303 s1 = r1 + (r1 >> 2); 304 s2 = r2 + (r2 >> 2); 305 s3 = r3 + (r3 >> 2); 306 307 h0 = st->h[0]; 308 h1 = st->h[1]; 309 h2 = st->h[2]; 310 h3 = st->h[3]; 311 h4 = st->h[4]; 312 313 while (len >= POLY1305_BLOCK_SIZE) { 314 /* h += m[i] */ 315 h0 = (u32)(d0 = (u64)h0 + U8TOU32(inp + 0)); 316 h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + U8TOU32(inp + 4)); 317 h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + U8TOU32(inp + 8)); 318 h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + U8TOU32(inp + 12)); 319 h4 += (u32)(d3 >> 32) + padbit; 320 321 /* h *= r "%" p, where "%" stands for "partial remainder" */ 322 d0 = ((u64)h0 * r0) + 323 ((u64)h1 * s3) + 324 ((u64)h2 * s2) + 325 ((u64)h3 * s1); 326 d1 = ((u64)h0 * r1) + 327 ((u64)h1 * r0) + 328 ((u64)h2 * s3) + 329 ((u64)h3 * s2) + 330 (h4 * s1); 331 d2 = ((u64)h0 * r2) + 332 ((u64)h1 * r1) + 333 ((u64)h2 * r0) + 334 ((u64)h3 * s3) + 335 (h4 * s2); 336 d3 = ((u64)h0 * r3) + 337 ((u64)h1 * r2) + 338 ((u64)h2 * r1) + 339 ((u64)h3 * r0) + 340 (h4 * s3); 341 h4 = (h4 * r0); 342 343 /* last reduction step: */ 344 /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */ 345 h0 = (u32)d0; 346 h1 = (u32)(d1 += d0 >> 32); 347 h2 = (u32)(d2 += d1 >> 32); 348 h3 = (u32)(d3 += d2 >> 32); 349 h4 += (u32)(d3 >> 32); 350 /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */ 351 c = (h4 >> 2) + (h4 & ~3U); 352 h4 &= 3; 353 h0 += c; 354 h1 += (c = CONSTANT_TIME_CARRY(h0,c)); 355 h2 += (c = CONSTANT_TIME_CARRY(h1,c)); 356 h3 += (c = CONSTANT_TIME_CARRY(h2,c)); 357 h4 += CONSTANT_TIME_CARRY(h3,c); 358 /* 359 * Occasional overflows to 3rd bit of h4 are taken care of 360 * "naturally". If after this point we end up at the top of 361 * this loop, then the overflow bit will be accounted for 362 * in next iteration. If we end up in poly1305_emit, then 363 * comparison to modulus below will still count as "carry 364 * into 131st bit", so that properly reduced value will be 365 * picked in conditional move. 366 */ 367 368 inp += POLY1305_BLOCK_SIZE; 369 len -= POLY1305_BLOCK_SIZE; 370 } 371 372 st->h[0] = h0; 373 st->h[1] = h1; 374 st->h[2] = h2; 375 st->h[3] = h3; 376 st->h[4] = h4; 377 } 378 379 static void poly1305_emit(void *ctx, unsigned char mac[16], 380 const u32 nonce[4]) 381 { 382 poly1305_internal *st = (poly1305_internal *) ctx; 383 u32 h0, h1, h2, h3, h4; 384 u32 g0, g1, g2, g3, g4; 385 u64 t; 386 u32 mask; 387 388 h0 = st->h[0]; 389 h1 = st->h[1]; 390 h2 = st->h[2]; 391 h3 = st->h[3]; 392 h4 = st->h[4]; 393 394 /* compare to modulus by computing h + -p */ 395 g0 = (u32)(t = (u64)h0 + 5); 396 g1 = (u32)(t = (u64)h1 + (t >> 32)); 397 g2 = (u32)(t = (u64)h2 + (t >> 32)); 398 g3 = (u32)(t = (u64)h3 + (t >> 32)); 399 g4 = h4 + (u32)(t >> 32); 400 401 /* if there was carry into 131st bit, h3:h0 = g3:g0 */ 402 mask = 0 - (g4 >> 2); 403 g0 &= mask; 404 g1 &= mask; 405 g2 &= mask; 406 g3 &= mask; 407 mask = ~mask; 408 h0 = (h0 & mask) | g0; 409 h1 = (h1 & mask) | g1; 410 h2 = (h2 & mask) | g2; 411 h3 = (h3 & mask) | g3; 412 413 /* mac = (h + nonce) % (2^128) */ 414 h0 = (u32)(t = (u64)h0 + nonce[0]); 415 h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]); 416 h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]); 417 h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]); 418 419 U32TO8(mac + 0, h0); 420 U32TO8(mac + 4, h1); 421 U32TO8(mac + 8, h2); 422 U32TO8(mac + 12, h3); 423 } 424 # endif 425 #else 426 int poly1305_init(void *ctx, const unsigned char key[16], void *func); 427 void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, 428 unsigned int padbit); 429 void poly1305_emit(void *ctx, unsigned char mac[16], 430 const unsigned int nonce[4]); 431 #endif 432 433 void Poly1305_Init(POLY1305 *ctx, const unsigned char key[32]) 434 { 435 ctx->nonce[0] = U8TOU32(&key[16]); 436 ctx->nonce[1] = U8TOU32(&key[20]); 437 ctx->nonce[2] = U8TOU32(&key[24]); 438 ctx->nonce[3] = U8TOU32(&key[28]); 439 440 #ifndef POLY1305_ASM 441 poly1305_init(ctx->opaque, key); 442 #else 443 /* 444 * Unlike reference poly1305_init assembly counterpart is expected 445 * to return a value: non-zero if it initializes ctx->func, and zero 446 * otherwise. Latter is to simplify assembly in cases when there no 447 * multiple code paths to switch between. 448 */ 449 if (!poly1305_init(ctx->opaque, key, &ctx->func)) { 450 ctx->func.blocks = poly1305_blocks; 451 ctx->func.emit = poly1305_emit; 452 } 453 #endif 454 455 ctx->num = 0; 456 457 } 458 459 #ifdef POLY1305_ASM 460 /* 461 * This "eclipses" poly1305_blocks and poly1305_emit, but it's 462 * conscious choice imposed by -Wshadow compiler warnings. 463 */ 464 # define poly1305_blocks (*poly1305_blocks_p) 465 # define poly1305_emit (*poly1305_emit_p) 466 #endif 467 468 void Poly1305_Update(POLY1305 *ctx, const unsigned char *inp, size_t len) 469 { 470 #ifdef POLY1305_ASM 471 /* 472 * As documented, poly1305_blocks is never called with input 473 * longer than single block and padbit argument set to 0. This 474 * property is fluently used in assembly modules to optimize 475 * padbit handling on loop boundary. 476 */ 477 poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks; 478 #endif 479 size_t rem, num; 480 481 if ((num = ctx->num)) { 482 rem = POLY1305_BLOCK_SIZE - num; 483 if (len >= rem) { 484 memcpy(ctx->data + num, inp, rem); 485 poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1); 486 inp += rem; 487 len -= rem; 488 } else { 489 /* Still not enough data to process a block. */ 490 memcpy(ctx->data + num, inp, len); 491 ctx->num = num + len; 492 return; 493 } 494 } 495 496 rem = len % POLY1305_BLOCK_SIZE; 497 len -= rem; 498 499 if (len >= POLY1305_BLOCK_SIZE) { 500 poly1305_blocks(ctx->opaque, inp, len, 1); 501 inp += len; 502 } 503 504 if (rem) 505 memcpy(ctx->data, inp, rem); 506 507 ctx->num = rem; 508 } 509 510 void Poly1305_Final(POLY1305 *ctx, unsigned char mac[16]) 511 { 512 #ifdef POLY1305_ASM 513 poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks; 514 poly1305_emit_f poly1305_emit_p = ctx->func.emit; 515 #endif 516 size_t num; 517 518 if ((num = ctx->num)) { 519 ctx->data[num++] = 1; /* pad bit */ 520 while (num < POLY1305_BLOCK_SIZE) 521 ctx->data[num++] = 0; 522 poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0); 523 } 524 525 poly1305_emit(ctx->opaque, mac, ctx->nonce); 526 527 /* zero out the state */ 528 OPENSSL_cleanse(ctx, sizeof(*ctx)); 529 } 530