1 /* 2 * Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved. 3 * Copyright (c) 2014, Intel Corporation. All Rights Reserved. 4 * Copyright (c) 2015, CloudFlare, Inc. 5 * 6 * Licensed under the OpenSSL license (the "License"). You may not use 7 * this file except in compliance with the License. You can obtain a copy 8 * in the file LICENSE in the source distribution or at 9 * https://www.openssl.org/source/license.html 10 * 11 * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) 12 * (1) Intel Corporation, Israel Development Center, Haifa, Israel 13 * (2) University of Haifa, Israel 14 * (3) CloudFlare, Inc. 15 * 16 * Reference: 17 * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with 18 * 256 Bit Primes" 19 */ 20 21 #include <string.h> 22 23 #include "internal/cryptlib.h" 24 #include "crypto/bn.h" 25 #include "ec_local.h" 26 #include "internal/refcount.h" 27 28 #if BN_BITS2 != 64 29 # define TOBN(hi,lo) lo,hi 30 #else 31 # define TOBN(hi,lo) ((BN_ULONG)hi<<32|lo) 32 #endif 33 34 #if defined(__GNUC__) 35 # define ALIGN32 __attribute((aligned(32))) 36 #elif defined(_MSC_VER) 37 # define ALIGN32 __declspec(align(32)) 38 #else 39 # define ALIGN32 40 #endif 41 42 #define ALIGNPTR(p,N) ((unsigned char *)p+N-(size_t)p%N) 43 #define P256_LIMBS (256/BN_BITS2) 44 45 typedef unsigned short u16; 46 47 typedef struct { 48 BN_ULONG X[P256_LIMBS]; 49 BN_ULONG Y[P256_LIMBS]; 50 BN_ULONG Z[P256_LIMBS]; 51 } P256_POINT; 52 53 typedef struct { 54 BN_ULONG X[P256_LIMBS]; 55 BN_ULONG Y[P256_LIMBS]; 56 } P256_POINT_AFFINE; 57 58 typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; 59 60 /* structure for precomputed multiples of the generator */ 61 struct nistz256_pre_comp_st { 62 const EC_GROUP *group; /* Parent EC_GROUP object */ 63 size_t w; /* Window size */ 64 /* 65 * Constant time access to the X and Y coordinates of the pre-computed, 66 * generator multiplies, in the Montgomery domain. Pre-calculated 67 * multiplies are stored in affine form. 68 */ 69 PRECOMP256_ROW *precomp; 70 void *precomp_storage; 71 CRYPTO_REF_COUNT references; 72 CRYPTO_RWLOCK *lock; 73 }; 74 75 /* Functions implemented in assembly */ 76 /* 77 * Most of below mentioned functions *preserve* the property of inputs 78 * being fully reduced, i.e. being in [0, modulus) range. Simply put if 79 * inputs are fully reduced, then output is too. Note that reverse is 80 * not true, in sense that given partially reduced inputs output can be 81 * either, not unlikely reduced. And "most" in first sentence refers to 82 * the fact that given the calculations flow one can tolerate that 83 * addition, 1st function below, produces partially reduced result *if* 84 * multiplications by 2 and 3, which customarily use addition, fully 85 * reduce it. This effectively gives two options: a) addition produces 86 * fully reduced result [as long as inputs are, just like remaining 87 * functions]; b) addition is allowed to produce partially reduced 88 * result, but multiplications by 2 and 3 perform additional reduction 89 * step. Choice between the two can be platform-specific, but it was a) 90 * in all cases so far... 91 */ 92 /* Modular add: res = a+b mod P */ 93 void ecp_nistz256_add(BN_ULONG res[P256_LIMBS], 94 const BN_ULONG a[P256_LIMBS], 95 const BN_ULONG b[P256_LIMBS]); 96 /* Modular mul by 2: res = 2*a mod P */ 97 void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS], 98 const BN_ULONG a[P256_LIMBS]); 99 /* Modular mul by 3: res = 3*a mod P */ 100 void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS], 101 const BN_ULONG a[P256_LIMBS]); 102 103 /* Modular div by 2: res = a/2 mod P */ 104 void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS], 105 const BN_ULONG a[P256_LIMBS]); 106 /* Modular sub: res = a-b mod P */ 107 void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS], 108 const BN_ULONG a[P256_LIMBS], 109 const BN_ULONG b[P256_LIMBS]); 110 /* Modular neg: res = -a mod P */ 111 void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); 112 /* Montgomery mul: res = a*b*2^-256 mod P */ 113 void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], 114 const BN_ULONG a[P256_LIMBS], 115 const BN_ULONG b[P256_LIMBS]); 116 /* Montgomery sqr: res = a*a*2^-256 mod P */ 117 void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], 118 const BN_ULONG a[P256_LIMBS]); 119 /* Convert a number from Montgomery domain, by multiplying with 1 */ 120 void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], 121 const BN_ULONG in[P256_LIMBS]); 122 /* Convert a number to Montgomery domain, by multiplying with 2^512 mod P*/ 123 void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS], 124 const BN_ULONG in[P256_LIMBS]); 125 /* Functions that perform constant time access to the precomputed tables */ 126 void ecp_nistz256_scatter_w5(P256_POINT *val, 127 const P256_POINT *in_t, int idx); 128 void ecp_nistz256_gather_w5(P256_POINT *val, 129 const P256_POINT *in_t, int idx); 130 void ecp_nistz256_scatter_w7(P256_POINT_AFFINE *val, 131 const P256_POINT_AFFINE *in_t, int idx); 132 void ecp_nistz256_gather_w7(P256_POINT_AFFINE *val, 133 const P256_POINT_AFFINE *in_t, int idx); 134 135 /* One converted into the Montgomery domain */ 136 static const BN_ULONG ONE[P256_LIMBS] = { 137 TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), 138 TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe) 139 }; 140 141 static NISTZ256_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group); 142 143 /* Precomputed tables for the default generator */ 144 extern const PRECOMP256_ROW ecp_nistz256_precomputed[37]; 145 146 /* Recode window to a signed digit, see ecp_nistputil.c for details */ 147 static unsigned int _booth_recode_w5(unsigned int in) 148 { 149 unsigned int s, d; 150 151 s = ~((in >> 5) - 1); 152 d = (1 << 6) - in - 1; 153 d = (d & s) | (in & ~s); 154 d = (d >> 1) + (d & 1); 155 156 return (d << 1) + (s & 1); 157 } 158 159 static unsigned int _booth_recode_w7(unsigned int in) 160 { 161 unsigned int s, d; 162 163 s = ~((in >> 7) - 1); 164 d = (1 << 8) - in - 1; 165 d = (d & s) | (in & ~s); 166 d = (d >> 1) + (d & 1); 167 168 return (d << 1) + (s & 1); 169 } 170 171 static void copy_conditional(BN_ULONG dst[P256_LIMBS], 172 const BN_ULONG src[P256_LIMBS], BN_ULONG move) 173 { 174 BN_ULONG mask1 = 0-move; 175 BN_ULONG mask2 = ~mask1; 176 177 dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); 178 dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); 179 dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); 180 dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); 181 if (P256_LIMBS == 8) { 182 dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); 183 dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); 184 dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); 185 dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); 186 } 187 } 188 189 static BN_ULONG is_zero(BN_ULONG in) 190 { 191 in |= (0 - in); 192 in = ~in; 193 in >>= BN_BITS2 - 1; 194 return in; 195 } 196 197 static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS], 198 const BN_ULONG b[P256_LIMBS]) 199 { 200 BN_ULONG res; 201 202 res = a[0] ^ b[0]; 203 res |= a[1] ^ b[1]; 204 res |= a[2] ^ b[2]; 205 res |= a[3] ^ b[3]; 206 if (P256_LIMBS == 8) { 207 res |= a[4] ^ b[4]; 208 res |= a[5] ^ b[5]; 209 res |= a[6] ^ b[6]; 210 res |= a[7] ^ b[7]; 211 } 212 213 return is_zero(res); 214 } 215 216 static BN_ULONG is_one(const BIGNUM *z) 217 { 218 BN_ULONG res = 0; 219 BN_ULONG *a = bn_get_words(z); 220 221 if (bn_get_top(z) == (P256_LIMBS - P256_LIMBS / 8)) { 222 res = a[0] ^ ONE[0]; 223 res |= a[1] ^ ONE[1]; 224 res |= a[2] ^ ONE[2]; 225 res |= a[3] ^ ONE[3]; 226 if (P256_LIMBS == 8) { 227 res |= a[4] ^ ONE[4]; 228 res |= a[5] ^ ONE[5]; 229 res |= a[6] ^ ONE[6]; 230 /* 231 * no check for a[7] (being zero) on 32-bit platforms, 232 * because value of "one" takes only 7 limbs. 233 */ 234 } 235 res = is_zero(res); 236 } 237 238 return res; 239 } 240 241 /* 242 * For reference, this macro is used only when new ecp_nistz256 assembly 243 * module is being developed. For example, configure with 244 * -DECP_NISTZ256_REFERENCE_IMPLEMENTATION and implement only functions 245 * performing simplest arithmetic operations on 256-bit vectors. Then 246 * work on implementation of higher-level functions performing point 247 * operations. Then remove ECP_NISTZ256_REFERENCE_IMPLEMENTATION 248 * and never define it again. (The correct macro denoting presence of 249 * ecp_nistz256 module is ECP_NISTZ256_ASM.) 250 */ 251 #ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION 252 void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); 253 void ecp_nistz256_point_add(P256_POINT *r, 254 const P256_POINT *a, const P256_POINT *b); 255 void ecp_nistz256_point_add_affine(P256_POINT *r, 256 const P256_POINT *a, 257 const P256_POINT_AFFINE *b); 258 #else 259 /* Point double: r = 2*a */ 260 static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) 261 { 262 BN_ULONG S[P256_LIMBS]; 263 BN_ULONG M[P256_LIMBS]; 264 BN_ULONG Zsqr[P256_LIMBS]; 265 BN_ULONG tmp0[P256_LIMBS]; 266 267 const BN_ULONG *in_x = a->X; 268 const BN_ULONG *in_y = a->Y; 269 const BN_ULONG *in_z = a->Z; 270 271 BN_ULONG *res_x = r->X; 272 BN_ULONG *res_y = r->Y; 273 BN_ULONG *res_z = r->Z; 274 275 ecp_nistz256_mul_by_2(S, in_y); 276 277 ecp_nistz256_sqr_mont(Zsqr, in_z); 278 279 ecp_nistz256_sqr_mont(S, S); 280 281 ecp_nistz256_mul_mont(res_z, in_z, in_y); 282 ecp_nistz256_mul_by_2(res_z, res_z); 283 284 ecp_nistz256_add(M, in_x, Zsqr); 285 ecp_nistz256_sub(Zsqr, in_x, Zsqr); 286 287 ecp_nistz256_sqr_mont(res_y, S); 288 ecp_nistz256_div_by_2(res_y, res_y); 289 290 ecp_nistz256_mul_mont(M, M, Zsqr); 291 ecp_nistz256_mul_by_3(M, M); 292 293 ecp_nistz256_mul_mont(S, S, in_x); 294 ecp_nistz256_mul_by_2(tmp0, S); 295 296 ecp_nistz256_sqr_mont(res_x, M); 297 298 ecp_nistz256_sub(res_x, res_x, tmp0); 299 ecp_nistz256_sub(S, S, res_x); 300 301 ecp_nistz256_mul_mont(S, S, M); 302 ecp_nistz256_sub(res_y, S, res_y); 303 } 304 305 /* Point addition: r = a+b */ 306 static void ecp_nistz256_point_add(P256_POINT *r, 307 const P256_POINT *a, const P256_POINT *b) 308 { 309 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 310 BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS]; 311 BN_ULONG Z1sqr[P256_LIMBS]; 312 BN_ULONG Z2sqr[P256_LIMBS]; 313 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 314 BN_ULONG Hsqr[P256_LIMBS]; 315 BN_ULONG Rsqr[P256_LIMBS]; 316 BN_ULONG Hcub[P256_LIMBS]; 317 318 BN_ULONG res_x[P256_LIMBS]; 319 BN_ULONG res_y[P256_LIMBS]; 320 BN_ULONG res_z[P256_LIMBS]; 321 322 BN_ULONG in1infty, in2infty; 323 324 const BN_ULONG *in1_x = a->X; 325 const BN_ULONG *in1_y = a->Y; 326 const BN_ULONG *in1_z = a->Z; 327 328 const BN_ULONG *in2_x = b->X; 329 const BN_ULONG *in2_y = b->Y; 330 const BN_ULONG *in2_z = b->Z; 331 332 /* 333 * Infinity in encoded as (,,0) 334 */ 335 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 336 if (P256_LIMBS == 8) 337 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 338 339 in2infty = (in2_z[0] | in2_z[1] | in2_z[2] | in2_z[3]); 340 if (P256_LIMBS == 8) 341 in2infty |= (in2_z[4] | in2_z[5] | in2_z[6] | in2_z[7]); 342 343 in1infty = is_zero(in1infty); 344 in2infty = is_zero(in2infty); 345 346 ecp_nistz256_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ 347 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 348 349 ecp_nistz256_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ 350 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 351 352 ecp_nistz256_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ 353 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 354 ecp_nistz256_sub(R, S2, S1); /* R = S2 - S1 */ 355 356 ecp_nistz256_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ 357 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 358 ecp_nistz256_sub(H, U2, U1); /* H = U2 - U1 */ 359 360 /* 361 * The formulae are incorrect if the points are equal so we check for 362 * this and do doubling if this happens. 363 * 364 * Points here are in Jacobian projective coordinates (Xi, Yi, Zi) 365 * that are bound to the affine coordinates (xi, yi) by the following 366 * equations: 367 * - xi = Xi / (Zi)^2 368 * - y1 = Yi / (Zi)^3 369 * 370 * For the sake of optimization, the algorithm operates over 371 * intermediate variables U1, U2 and S1, S2 that are derived from 372 * the projective coordinates: 373 * - U1 = X1 * (Z2)^2 ; U2 = X2 * (Z1)^2 374 * - S1 = Y1 * (Z2)^3 ; S2 = Y2 * (Z1)^3 375 * 376 * It is easy to prove that is_equal(U1, U2) implies that the affine 377 * x-coordinates are equal, or either point is at infinity. 378 * Likewise is_equal(S1, S2) implies that the affine y-coordinates are 379 * equal, or either point is at infinity. 380 * 381 * The special case of either point being the point at infinity (Z1 or Z2 382 * is zero), is handled separately later on in this function, so we avoid 383 * jumping to point_double here in those special cases. 384 * 385 * When both points are inverse of each other, we know that the affine 386 * x-coordinates are equal, and the y-coordinates have different sign. 387 * Therefore since U1 = U2, we know H = 0, and therefore Z3 = H*Z1*Z2 388 * will equal 0, thus the result is infinity, if we simply let this 389 * function continue normally. 390 * 391 * We use bitwise operations to avoid potential side-channels introduced by 392 * the short-circuiting behaviour of boolean operators. 393 */ 394 if (is_equal(U1, U2) & ~in1infty & ~in2infty & is_equal(S1, S2)) { 395 /* 396 * This is obviously not constant-time but it should never happen during 397 * single point multiplication, so there is no timing leak for ECDH or 398 * ECDSA signing. 399 */ 400 ecp_nistz256_point_double(r, a); 401 return; 402 } 403 404 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 405 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 406 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 407 ecp_nistz256_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ 408 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 409 410 ecp_nistz256_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ 411 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 412 413 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 414 ecp_nistz256_sub(res_x, res_x, Hcub); 415 416 ecp_nistz256_sub(res_y, U2, res_x); 417 418 ecp_nistz256_mul_mont(S2, S1, Hcub); 419 ecp_nistz256_mul_mont(res_y, R, res_y); 420 ecp_nistz256_sub(res_y, res_y, S2); 421 422 copy_conditional(res_x, in2_x, in1infty); 423 copy_conditional(res_y, in2_y, in1infty); 424 copy_conditional(res_z, in2_z, in1infty); 425 426 copy_conditional(res_x, in1_x, in2infty); 427 copy_conditional(res_y, in1_y, in2infty); 428 copy_conditional(res_z, in1_z, in2infty); 429 430 memcpy(r->X, res_x, sizeof(res_x)); 431 memcpy(r->Y, res_y, sizeof(res_y)); 432 memcpy(r->Z, res_z, sizeof(res_z)); 433 } 434 435 /* Point addition when b is known to be affine: r = a+b */ 436 static void ecp_nistz256_point_add_affine(P256_POINT *r, 437 const P256_POINT *a, 438 const P256_POINT_AFFINE *b) 439 { 440 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 441 BN_ULONG Z1sqr[P256_LIMBS]; 442 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 443 BN_ULONG Hsqr[P256_LIMBS]; 444 BN_ULONG Rsqr[P256_LIMBS]; 445 BN_ULONG Hcub[P256_LIMBS]; 446 447 BN_ULONG res_x[P256_LIMBS]; 448 BN_ULONG res_y[P256_LIMBS]; 449 BN_ULONG res_z[P256_LIMBS]; 450 451 BN_ULONG in1infty, in2infty; 452 453 const BN_ULONG *in1_x = a->X; 454 const BN_ULONG *in1_y = a->Y; 455 const BN_ULONG *in1_z = a->Z; 456 457 const BN_ULONG *in2_x = b->X; 458 const BN_ULONG *in2_y = b->Y; 459 460 /* 461 * Infinity in encoded as (,,0) 462 */ 463 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 464 if (P256_LIMBS == 8) 465 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 466 467 /* 468 * In affine representation we encode infinity as (0,0), which is 469 * not on the curve, so it is OK 470 */ 471 in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] | 472 in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]); 473 if (P256_LIMBS == 8) 474 in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] | 475 in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]); 476 477 in1infty = is_zero(in1infty); 478 in2infty = is_zero(in2infty); 479 480 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 481 482 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 483 ecp_nistz256_sub(H, U2, in1_x); /* H = U2 - U1 */ 484 485 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 486 487 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 488 489 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 490 ecp_nistz256_sub(R, S2, in1_y); /* R = S2 - S1 */ 491 492 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 493 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 494 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 495 496 ecp_nistz256_mul_mont(U2, in1_x, Hsqr); /* U1*H^2 */ 497 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 498 499 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 500 ecp_nistz256_sub(res_x, res_x, Hcub); 501 ecp_nistz256_sub(H, U2, res_x); 502 503 ecp_nistz256_mul_mont(S2, in1_y, Hcub); 504 ecp_nistz256_mul_mont(H, H, R); 505 ecp_nistz256_sub(res_y, H, S2); 506 507 copy_conditional(res_x, in2_x, in1infty); 508 copy_conditional(res_x, in1_x, in2infty); 509 510 copy_conditional(res_y, in2_y, in1infty); 511 copy_conditional(res_y, in1_y, in2infty); 512 513 copy_conditional(res_z, ONE, in1infty); 514 copy_conditional(res_z, in1_z, in2infty); 515 516 memcpy(r->X, res_x, sizeof(res_x)); 517 memcpy(r->Y, res_y, sizeof(res_y)); 518 memcpy(r->Z, res_z, sizeof(res_z)); 519 } 520 #endif 521 522 /* r = in^-1 mod p */ 523 static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS], 524 const BN_ULONG in[P256_LIMBS]) 525 { 526 /* 527 * The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff 528 * ffffffff ffffffff We use FLT and used poly-2 as exponent 529 */ 530 BN_ULONG p2[P256_LIMBS]; 531 BN_ULONG p4[P256_LIMBS]; 532 BN_ULONG p8[P256_LIMBS]; 533 BN_ULONG p16[P256_LIMBS]; 534 BN_ULONG p32[P256_LIMBS]; 535 BN_ULONG res[P256_LIMBS]; 536 int i; 537 538 ecp_nistz256_sqr_mont(res, in); 539 ecp_nistz256_mul_mont(p2, res, in); /* 3*p */ 540 541 ecp_nistz256_sqr_mont(res, p2); 542 ecp_nistz256_sqr_mont(res, res); 543 ecp_nistz256_mul_mont(p4, res, p2); /* f*p */ 544 545 ecp_nistz256_sqr_mont(res, p4); 546 ecp_nistz256_sqr_mont(res, res); 547 ecp_nistz256_sqr_mont(res, res); 548 ecp_nistz256_sqr_mont(res, res); 549 ecp_nistz256_mul_mont(p8, res, p4); /* ff*p */ 550 551 ecp_nistz256_sqr_mont(res, p8); 552 for (i = 0; i < 7; i++) 553 ecp_nistz256_sqr_mont(res, res); 554 ecp_nistz256_mul_mont(p16, res, p8); /* ffff*p */ 555 556 ecp_nistz256_sqr_mont(res, p16); 557 for (i = 0; i < 15; i++) 558 ecp_nistz256_sqr_mont(res, res); 559 ecp_nistz256_mul_mont(p32, res, p16); /* ffffffff*p */ 560 561 ecp_nistz256_sqr_mont(res, p32); 562 for (i = 0; i < 31; i++) 563 ecp_nistz256_sqr_mont(res, res); 564 ecp_nistz256_mul_mont(res, res, in); 565 566 for (i = 0; i < 32 * 4; i++) 567 ecp_nistz256_sqr_mont(res, res); 568 ecp_nistz256_mul_mont(res, res, p32); 569 570 for (i = 0; i < 32; i++) 571 ecp_nistz256_sqr_mont(res, res); 572 ecp_nistz256_mul_mont(res, res, p32); 573 574 for (i = 0; i < 16; i++) 575 ecp_nistz256_sqr_mont(res, res); 576 ecp_nistz256_mul_mont(res, res, p16); 577 578 for (i = 0; i < 8; i++) 579 ecp_nistz256_sqr_mont(res, res); 580 ecp_nistz256_mul_mont(res, res, p8); 581 582 ecp_nistz256_sqr_mont(res, res); 583 ecp_nistz256_sqr_mont(res, res); 584 ecp_nistz256_sqr_mont(res, res); 585 ecp_nistz256_sqr_mont(res, res); 586 ecp_nistz256_mul_mont(res, res, p4); 587 588 ecp_nistz256_sqr_mont(res, res); 589 ecp_nistz256_sqr_mont(res, res); 590 ecp_nistz256_mul_mont(res, res, p2); 591 592 ecp_nistz256_sqr_mont(res, res); 593 ecp_nistz256_sqr_mont(res, res); 594 ecp_nistz256_mul_mont(res, res, in); 595 596 memcpy(r, res, sizeof(res)); 597 } 598 599 /* 600 * ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and 601 * returns one if it fits. Otherwise it returns zero. 602 */ 603 __owur static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS], 604 const BIGNUM *in) 605 { 606 return bn_copy_words(out, in, P256_LIMBS); 607 } 608 609 /* r = sum(scalar[i]*point[i]) */ 610 __owur static int ecp_nistz256_windowed_mul(const EC_GROUP *group, 611 P256_POINT *r, 612 const BIGNUM **scalar, 613 const EC_POINT **point, 614 size_t num, BN_CTX *ctx) 615 { 616 size_t i; 617 int j, ret = 0; 618 unsigned int idx; 619 unsigned char (*p_str)[33] = NULL; 620 const unsigned int window_size = 5; 621 const unsigned int mask = (1 << (window_size + 1)) - 1; 622 unsigned int wvalue; 623 P256_POINT *temp; /* place for 5 temporary points */ 624 const BIGNUM **scalars = NULL; 625 P256_POINT (*table)[16] = NULL; 626 void *table_storage = NULL; 627 628 if ((num * 16 + 6) > OPENSSL_MALLOC_MAX_NELEMS(P256_POINT) 629 || (table_storage = 630 OPENSSL_malloc((num * 16 + 5) * sizeof(P256_POINT) + 64)) == NULL 631 || (p_str = 632 OPENSSL_malloc(num * 33 * sizeof(unsigned char))) == NULL 633 || (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) { 634 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_MALLOC_FAILURE); 635 goto err; 636 } 637 638 table = (void *)ALIGNPTR(table_storage, 64); 639 temp = (P256_POINT *)(table + num); 640 641 for (i = 0; i < num; i++) { 642 P256_POINT *row = table[i]; 643 644 /* This is an unusual input, we don't guarantee constant-timeness. */ 645 if ((BN_num_bits(scalar[i]) > 256) || BN_is_negative(scalar[i])) { 646 BIGNUM *mod; 647 648 if ((mod = BN_CTX_get(ctx)) == NULL) 649 goto err; 650 if (!BN_nnmod(mod, scalar[i], group->order, ctx)) { 651 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_BN_LIB); 652 goto err; 653 } 654 scalars[i] = mod; 655 } else 656 scalars[i] = scalar[i]; 657 658 for (j = 0; j < bn_get_top(scalars[i]) * BN_BYTES; j += BN_BYTES) { 659 BN_ULONG d = bn_get_words(scalars[i])[j / BN_BYTES]; 660 661 p_str[i][j + 0] = (unsigned char)d; 662 p_str[i][j + 1] = (unsigned char)(d >> 8); 663 p_str[i][j + 2] = (unsigned char)(d >> 16); 664 p_str[i][j + 3] = (unsigned char)(d >>= 24); 665 if (BN_BYTES == 8) { 666 d >>= 8; 667 p_str[i][j + 4] = (unsigned char)d; 668 p_str[i][j + 5] = (unsigned char)(d >> 8); 669 p_str[i][j + 6] = (unsigned char)(d >> 16); 670 p_str[i][j + 7] = (unsigned char)(d >> 24); 671 } 672 } 673 for (; j < 33; j++) 674 p_str[i][j] = 0; 675 676 if (!ecp_nistz256_bignum_to_field_elem(temp[0].X, point[i]->X) 677 || !ecp_nistz256_bignum_to_field_elem(temp[0].Y, point[i]->Y) 678 || !ecp_nistz256_bignum_to_field_elem(temp[0].Z, point[i]->Z)) { 679 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, 680 EC_R_COORDINATES_OUT_OF_RANGE); 681 goto err; 682 } 683 684 /* 685 * row[0] is implicitly (0,0,0) (the point at infinity), therefore it 686 * is not stored. All other values are actually stored with an offset 687 * of -1 in table. 688 */ 689 690 ecp_nistz256_scatter_w5 (row, &temp[0], 1); 691 ecp_nistz256_point_double(&temp[1], &temp[0]); /*1+1=2 */ 692 ecp_nistz256_scatter_w5 (row, &temp[1], 2); 693 ecp_nistz256_point_add (&temp[2], &temp[1], &temp[0]); /*2+1=3 */ 694 ecp_nistz256_scatter_w5 (row, &temp[2], 3); 695 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*2=4 */ 696 ecp_nistz256_scatter_w5 (row, &temp[1], 4); 697 ecp_nistz256_point_double(&temp[2], &temp[2]); /*2*3=6 */ 698 ecp_nistz256_scatter_w5 (row, &temp[2], 6); 699 ecp_nistz256_point_add (&temp[3], &temp[1], &temp[0]); /*4+1=5 */ 700 ecp_nistz256_scatter_w5 (row, &temp[3], 5); 701 ecp_nistz256_point_add (&temp[4], &temp[2], &temp[0]); /*6+1=7 */ 702 ecp_nistz256_scatter_w5 (row, &temp[4], 7); 703 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*4=8 */ 704 ecp_nistz256_scatter_w5 (row, &temp[1], 8); 705 ecp_nistz256_point_double(&temp[2], &temp[2]); /*2*6=12 */ 706 ecp_nistz256_scatter_w5 (row, &temp[2], 12); 707 ecp_nistz256_point_double(&temp[3], &temp[3]); /*2*5=10 */ 708 ecp_nistz256_scatter_w5 (row, &temp[3], 10); 709 ecp_nistz256_point_double(&temp[4], &temp[4]); /*2*7=14 */ 710 ecp_nistz256_scatter_w5 (row, &temp[4], 14); 711 ecp_nistz256_point_add (&temp[2], &temp[2], &temp[0]); /*12+1=13*/ 712 ecp_nistz256_scatter_w5 (row, &temp[2], 13); 713 ecp_nistz256_point_add (&temp[3], &temp[3], &temp[0]); /*10+1=11*/ 714 ecp_nistz256_scatter_w5 (row, &temp[3], 11); 715 ecp_nistz256_point_add (&temp[4], &temp[4], &temp[0]); /*14+1=15*/ 716 ecp_nistz256_scatter_w5 (row, &temp[4], 15); 717 ecp_nistz256_point_add (&temp[2], &temp[1], &temp[0]); /*8+1=9 */ 718 ecp_nistz256_scatter_w5 (row, &temp[2], 9); 719 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*8=16 */ 720 ecp_nistz256_scatter_w5 (row, &temp[1], 16); 721 } 722 723 idx = 255; 724 725 wvalue = p_str[0][(idx - 1) / 8]; 726 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 727 728 /* 729 * We gather to temp[0], because we know it's position relative 730 * to table 731 */ 732 ecp_nistz256_gather_w5(&temp[0], table[0], _booth_recode_w5(wvalue) >> 1); 733 memcpy(r, &temp[0], sizeof(temp[0])); 734 735 while (idx >= 5) { 736 for (i = (idx == 255 ? 1 : 0); i < num; i++) { 737 unsigned int off = (idx - 1) / 8; 738 739 wvalue = p_str[i][off] | p_str[i][off + 1] << 8; 740 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 741 742 wvalue = _booth_recode_w5(wvalue); 743 744 ecp_nistz256_gather_w5(&temp[0], table[i], wvalue >> 1); 745 746 ecp_nistz256_neg(temp[1].Y, temp[0].Y); 747 copy_conditional(temp[0].Y, temp[1].Y, (wvalue & 1)); 748 749 ecp_nistz256_point_add(r, r, &temp[0]); 750 } 751 752 idx -= window_size; 753 754 ecp_nistz256_point_double(r, r); 755 ecp_nistz256_point_double(r, r); 756 ecp_nistz256_point_double(r, r); 757 ecp_nistz256_point_double(r, r); 758 ecp_nistz256_point_double(r, r); 759 } 760 761 /* Final window */ 762 for (i = 0; i < num; i++) { 763 wvalue = p_str[i][0]; 764 wvalue = (wvalue << 1) & mask; 765 766 wvalue = _booth_recode_w5(wvalue); 767 768 ecp_nistz256_gather_w5(&temp[0], table[i], wvalue >> 1); 769 770 ecp_nistz256_neg(temp[1].Y, temp[0].Y); 771 copy_conditional(temp[0].Y, temp[1].Y, wvalue & 1); 772 773 ecp_nistz256_point_add(r, r, &temp[0]); 774 } 775 776 ret = 1; 777 err: 778 OPENSSL_free(table_storage); 779 OPENSSL_free(p_str); 780 OPENSSL_free(scalars); 781 return ret; 782 } 783 784 /* Coordinates of G, for which we have precomputed tables */ 785 static const BN_ULONG def_xG[P256_LIMBS] = { 786 TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), 787 TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6) 788 }; 789 790 static const BN_ULONG def_yG[P256_LIMBS] = { 791 TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), 792 TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85) 793 }; 794 795 /* 796 * ecp_nistz256_is_affine_G returns one if |generator| is the standard, P-256 797 * generator. 798 */ 799 static int ecp_nistz256_is_affine_G(const EC_POINT *generator) 800 { 801 return (bn_get_top(generator->X) == P256_LIMBS) && 802 (bn_get_top(generator->Y) == P256_LIMBS) && 803 is_equal(bn_get_words(generator->X), def_xG) && 804 is_equal(bn_get_words(generator->Y), def_yG) && 805 is_one(generator->Z); 806 } 807 808 __owur static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx) 809 { 810 /* 811 * We precompute a table for a Booth encoded exponent (wNAF) based 812 * computation. Each table holds 64 values for safe access, with an 813 * implicit value of infinity at index zero. We use window of size 7, and 814 * therefore require ceil(256/7) = 37 tables. 815 */ 816 const BIGNUM *order; 817 EC_POINT *P = NULL, *T = NULL; 818 const EC_POINT *generator; 819 NISTZ256_PRE_COMP *pre_comp; 820 BN_CTX *new_ctx = NULL; 821 int i, j, k, ret = 0; 822 size_t w; 823 824 PRECOMP256_ROW *preComputedTable = NULL; 825 unsigned char *precomp_storage = NULL; 826 827 /* if there is an old NISTZ256_PRE_COMP object, throw it away */ 828 EC_pre_comp_free(group); 829 generator = EC_GROUP_get0_generator(group); 830 if (generator == NULL) { 831 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNDEFINED_GENERATOR); 832 return 0; 833 } 834 835 if (ecp_nistz256_is_affine_G(generator)) { 836 /* 837 * No need to calculate tables for the standard generator because we 838 * have them statically. 839 */ 840 return 1; 841 } 842 843 if ((pre_comp = ecp_nistz256_pre_comp_new(group)) == NULL) 844 return 0; 845 846 if (ctx == NULL) { 847 ctx = new_ctx = BN_CTX_new(); 848 if (ctx == NULL) 849 goto err; 850 } 851 852 BN_CTX_start(ctx); 853 854 order = EC_GROUP_get0_order(group); 855 if (order == NULL) 856 goto err; 857 858 if (BN_is_zero(order)) { 859 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNKNOWN_ORDER); 860 goto err; 861 } 862 863 w = 7; 864 865 if ((precomp_storage = 866 OPENSSL_malloc(37 * 64 * sizeof(P256_POINT_AFFINE) + 64)) == NULL) { 867 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, ERR_R_MALLOC_FAILURE); 868 goto err; 869 } 870 871 preComputedTable = (void *)ALIGNPTR(precomp_storage, 64); 872 873 P = EC_POINT_new(group); 874 T = EC_POINT_new(group); 875 if (P == NULL || T == NULL) 876 goto err; 877 878 /* 879 * The zero entry is implicitly infinity, and we skip it, storing other 880 * values with -1 offset. 881 */ 882 if (!EC_POINT_copy(T, generator)) 883 goto err; 884 885 for (k = 0; k < 64; k++) { 886 if (!EC_POINT_copy(P, T)) 887 goto err; 888 for (j = 0; j < 37; j++) { 889 P256_POINT_AFFINE temp; 890 /* 891 * It would be faster to use EC_POINTs_make_affine and 892 * make multiple points affine at the same time. 893 */ 894 if (!EC_POINT_make_affine(group, P, ctx)) 895 goto err; 896 if (!ecp_nistz256_bignum_to_field_elem(temp.X, P->X) || 897 !ecp_nistz256_bignum_to_field_elem(temp.Y, P->Y)) { 898 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 899 EC_R_COORDINATES_OUT_OF_RANGE); 900 goto err; 901 } 902 ecp_nistz256_scatter_w7(preComputedTable[j], &temp, k); 903 for (i = 0; i < 7; i++) { 904 if (!EC_POINT_dbl(group, P, P, ctx)) 905 goto err; 906 } 907 } 908 if (!EC_POINT_add(group, T, T, generator, ctx)) 909 goto err; 910 } 911 912 pre_comp->group = group; 913 pre_comp->w = w; 914 pre_comp->precomp = preComputedTable; 915 pre_comp->precomp_storage = precomp_storage; 916 precomp_storage = NULL; 917 SETPRECOMP(group, nistz256, pre_comp); 918 pre_comp = NULL; 919 ret = 1; 920 921 err: 922 BN_CTX_end(ctx); 923 BN_CTX_free(new_ctx); 924 925 EC_nistz256_pre_comp_free(pre_comp); 926 OPENSSL_free(precomp_storage); 927 EC_POINT_free(P); 928 EC_POINT_free(T); 929 return ret; 930 } 931 932 __owur static int ecp_nistz256_set_from_affine(EC_POINT *out, const EC_GROUP *group, 933 const P256_POINT_AFFINE *in, 934 BN_CTX *ctx) 935 { 936 int ret = 0; 937 938 if ((ret = bn_set_words(out->X, in->X, P256_LIMBS)) 939 && (ret = bn_set_words(out->Y, in->Y, P256_LIMBS)) 940 && (ret = bn_set_words(out->Z, ONE, P256_LIMBS))) 941 out->Z_is_one = 1; 942 943 return ret; 944 } 945 946 /* r = scalar*G + sum(scalars[i]*points[i]) */ 947 __owur static int ecp_nistz256_points_mul(const EC_GROUP *group, 948 EC_POINT *r, 949 const BIGNUM *scalar, 950 size_t num, 951 const EC_POINT *points[], 952 const BIGNUM *scalars[], BN_CTX *ctx) 953 { 954 int i = 0, ret = 0, no_precomp_for_generator = 0, p_is_infinity = 0; 955 unsigned char p_str[33] = { 0 }; 956 const PRECOMP256_ROW *preComputedTable = NULL; 957 const NISTZ256_PRE_COMP *pre_comp = NULL; 958 const EC_POINT *generator = NULL; 959 const BIGNUM **new_scalars = NULL; 960 const EC_POINT **new_points = NULL; 961 unsigned int idx = 0; 962 const unsigned int window_size = 7; 963 const unsigned int mask = (1 << (window_size + 1)) - 1; 964 unsigned int wvalue; 965 ALIGN32 union { 966 P256_POINT p; 967 P256_POINT_AFFINE a; 968 } t, p; 969 BIGNUM *tmp_scalar; 970 971 if ((num + 1) == 0 || (num + 1) > OPENSSL_MALLOC_MAX_NELEMS(void *)) { 972 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 973 return 0; 974 } 975 976 memset(&p, 0, sizeof(p)); 977 BN_CTX_start(ctx); 978 979 if (scalar) { 980 generator = EC_GROUP_get0_generator(group); 981 if (generator == NULL) { 982 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_UNDEFINED_GENERATOR); 983 goto err; 984 } 985 986 /* look if we can use precomputed multiples of generator */ 987 pre_comp = group->pre_comp.nistz256; 988 989 if (pre_comp) { 990 /* 991 * If there is a precomputed table for the generator, check that 992 * it was generated with the same generator. 993 */ 994 EC_POINT *pre_comp_generator = EC_POINT_new(group); 995 if (pre_comp_generator == NULL) 996 goto err; 997 998 ecp_nistz256_gather_w7(&p.a, pre_comp->precomp[0], 1); 999 if (!ecp_nistz256_set_from_affine(pre_comp_generator, 1000 group, &p.a, ctx)) { 1001 EC_POINT_free(pre_comp_generator); 1002 goto err; 1003 } 1004 1005 if (0 == EC_POINT_cmp(group, generator, pre_comp_generator, ctx)) 1006 preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp; 1007 1008 EC_POINT_free(pre_comp_generator); 1009 } 1010 1011 if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) { 1012 /* 1013 * If there is no precomputed data, but the generator is the 1014 * default, a hardcoded table of precomputed data is used. This 1015 * is because applications, such as Apache, do not use 1016 * EC_KEY_precompute_mult. 1017 */ 1018 preComputedTable = ecp_nistz256_precomputed; 1019 } 1020 1021 if (preComputedTable) { 1022 BN_ULONG infty; 1023 1024 if ((BN_num_bits(scalar) > 256) 1025 || BN_is_negative(scalar)) { 1026 if ((tmp_scalar = BN_CTX_get(ctx)) == NULL) 1027 goto err; 1028 1029 if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) { 1030 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_BN_LIB); 1031 goto err; 1032 } 1033 scalar = tmp_scalar; 1034 } 1035 1036 for (i = 0; i < bn_get_top(scalar) * BN_BYTES; i += BN_BYTES) { 1037 BN_ULONG d = bn_get_words(scalar)[i / BN_BYTES]; 1038 1039 p_str[i + 0] = (unsigned char)d; 1040 p_str[i + 1] = (unsigned char)(d >> 8); 1041 p_str[i + 2] = (unsigned char)(d >> 16); 1042 p_str[i + 3] = (unsigned char)(d >>= 24); 1043 if (BN_BYTES == 8) { 1044 d >>= 8; 1045 p_str[i + 4] = (unsigned char)d; 1046 p_str[i + 5] = (unsigned char)(d >> 8); 1047 p_str[i + 6] = (unsigned char)(d >> 16); 1048 p_str[i + 7] = (unsigned char)(d >> 24); 1049 } 1050 } 1051 1052 for (; i < 33; i++) 1053 p_str[i] = 0; 1054 1055 /* First window */ 1056 wvalue = (p_str[0] << 1) & mask; 1057 idx += window_size; 1058 1059 wvalue = _booth_recode_w7(wvalue); 1060 1061 ecp_nistz256_gather_w7(&p.a, preComputedTable[0], 1062 wvalue >> 1); 1063 1064 ecp_nistz256_neg(p.p.Z, p.p.Y); 1065 copy_conditional(p.p.Y, p.p.Z, wvalue & 1); 1066 1067 /* 1068 * Since affine infinity is encoded as (0,0) and 1069 * Jacobian is (,,0), we need to harmonize them 1070 * by assigning "one" or zero to Z. 1071 */ 1072 infty = (p.p.X[0] | p.p.X[1] | p.p.X[2] | p.p.X[3] | 1073 p.p.Y[0] | p.p.Y[1] | p.p.Y[2] | p.p.Y[3]); 1074 if (P256_LIMBS == 8) 1075 infty |= (p.p.X[4] | p.p.X[5] | p.p.X[6] | p.p.X[7] | 1076 p.p.Y[4] | p.p.Y[5] | p.p.Y[6] | p.p.Y[7]); 1077 1078 infty = 0 - is_zero(infty); 1079 infty = ~infty; 1080 1081 p.p.Z[0] = ONE[0] & infty; 1082 p.p.Z[1] = ONE[1] & infty; 1083 p.p.Z[2] = ONE[2] & infty; 1084 p.p.Z[3] = ONE[3] & infty; 1085 if (P256_LIMBS == 8) { 1086 p.p.Z[4] = ONE[4] & infty; 1087 p.p.Z[5] = ONE[5] & infty; 1088 p.p.Z[6] = ONE[6] & infty; 1089 p.p.Z[7] = ONE[7] & infty; 1090 } 1091 1092 for (i = 1; i < 37; i++) { 1093 unsigned int off = (idx - 1) / 8; 1094 wvalue = p_str[off] | p_str[off + 1] << 8; 1095 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1096 idx += window_size; 1097 1098 wvalue = _booth_recode_w7(wvalue); 1099 1100 ecp_nistz256_gather_w7(&t.a, 1101 preComputedTable[i], wvalue >> 1); 1102 1103 ecp_nistz256_neg(t.p.Z, t.a.Y); 1104 copy_conditional(t.a.Y, t.p.Z, wvalue & 1); 1105 1106 ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a); 1107 } 1108 } else { 1109 p_is_infinity = 1; 1110 no_precomp_for_generator = 1; 1111 } 1112 } else 1113 p_is_infinity = 1; 1114 1115 if (no_precomp_for_generator) { 1116 /* 1117 * Without a precomputed table for the generator, it has to be 1118 * handled like a normal point. 1119 */ 1120 new_scalars = OPENSSL_malloc((num + 1) * sizeof(BIGNUM *)); 1121 if (new_scalars == NULL) { 1122 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1123 goto err; 1124 } 1125 1126 new_points = OPENSSL_malloc((num + 1) * sizeof(EC_POINT *)); 1127 if (new_points == NULL) { 1128 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1129 goto err; 1130 } 1131 1132 memcpy(new_scalars, scalars, num * sizeof(BIGNUM *)); 1133 new_scalars[num] = scalar; 1134 memcpy(new_points, points, num * sizeof(EC_POINT *)); 1135 new_points[num] = generator; 1136 1137 scalars = new_scalars; 1138 points = new_points; 1139 num++; 1140 } 1141 1142 if (num) { 1143 P256_POINT *out = &t.p; 1144 if (p_is_infinity) 1145 out = &p.p; 1146 1147 if (!ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx)) 1148 goto err; 1149 1150 if (!p_is_infinity) 1151 ecp_nistz256_point_add(&p.p, &p.p, out); 1152 } 1153 1154 /* Not constant-time, but we're only operating on the public output. */ 1155 if (!bn_set_words(r->X, p.p.X, P256_LIMBS) || 1156 !bn_set_words(r->Y, p.p.Y, P256_LIMBS) || 1157 !bn_set_words(r->Z, p.p.Z, P256_LIMBS)) { 1158 goto err; 1159 } 1160 r->Z_is_one = is_one(r->Z) & 1; 1161 1162 ret = 1; 1163 1164 err: 1165 BN_CTX_end(ctx); 1166 OPENSSL_free(new_points); 1167 OPENSSL_free(new_scalars); 1168 return ret; 1169 } 1170 1171 __owur static int ecp_nistz256_get_affine(const EC_GROUP *group, 1172 const EC_POINT *point, 1173 BIGNUM *x, BIGNUM *y, BN_CTX *ctx) 1174 { 1175 BN_ULONG z_inv2[P256_LIMBS]; 1176 BN_ULONG z_inv3[P256_LIMBS]; 1177 BN_ULONG x_aff[P256_LIMBS]; 1178 BN_ULONG y_aff[P256_LIMBS]; 1179 BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS]; 1180 BN_ULONG x_ret[P256_LIMBS], y_ret[P256_LIMBS]; 1181 1182 if (EC_POINT_is_at_infinity(group, point)) { 1183 ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_POINT_AT_INFINITY); 1184 return 0; 1185 } 1186 1187 if (!ecp_nistz256_bignum_to_field_elem(point_x, point->X) || 1188 !ecp_nistz256_bignum_to_field_elem(point_y, point->Y) || 1189 !ecp_nistz256_bignum_to_field_elem(point_z, point->Z)) { 1190 ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_COORDINATES_OUT_OF_RANGE); 1191 return 0; 1192 } 1193 1194 ecp_nistz256_mod_inverse(z_inv3, point_z); 1195 ecp_nistz256_sqr_mont(z_inv2, z_inv3); 1196 ecp_nistz256_mul_mont(x_aff, z_inv2, point_x); 1197 1198 if (x != NULL) { 1199 ecp_nistz256_from_mont(x_ret, x_aff); 1200 if (!bn_set_words(x, x_ret, P256_LIMBS)) 1201 return 0; 1202 } 1203 1204 if (y != NULL) { 1205 ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2); 1206 ecp_nistz256_mul_mont(y_aff, z_inv3, point_y); 1207 ecp_nistz256_from_mont(y_ret, y_aff); 1208 if (!bn_set_words(y, y_ret, P256_LIMBS)) 1209 return 0; 1210 } 1211 1212 return 1; 1213 } 1214 1215 static NISTZ256_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group) 1216 { 1217 NISTZ256_PRE_COMP *ret = NULL; 1218 1219 if (!group) 1220 return NULL; 1221 1222 ret = OPENSSL_zalloc(sizeof(*ret)); 1223 1224 if (ret == NULL) { 1225 ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); 1226 return ret; 1227 } 1228 1229 ret->group = group; 1230 ret->w = 6; /* default */ 1231 ret->references = 1; 1232 1233 ret->lock = CRYPTO_THREAD_lock_new(); 1234 if (ret->lock == NULL) { 1235 ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); 1236 OPENSSL_free(ret); 1237 return NULL; 1238 } 1239 return ret; 1240 } 1241 1242 NISTZ256_PRE_COMP *EC_nistz256_pre_comp_dup(NISTZ256_PRE_COMP *p) 1243 { 1244 int i; 1245 if (p != NULL) 1246 CRYPTO_UP_REF(&p->references, &i, p->lock); 1247 return p; 1248 } 1249 1250 void EC_nistz256_pre_comp_free(NISTZ256_PRE_COMP *pre) 1251 { 1252 int i; 1253 1254 if (pre == NULL) 1255 return; 1256 1257 CRYPTO_DOWN_REF(&pre->references, &i, pre->lock); 1258 REF_PRINT_COUNT("EC_nistz256", x); 1259 if (i > 0) 1260 return; 1261 REF_ASSERT_ISNT(i < 0); 1262 1263 OPENSSL_free(pre->precomp_storage); 1264 CRYPTO_THREAD_lock_free(pre->lock); 1265 OPENSSL_free(pre); 1266 } 1267 1268 1269 static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group) 1270 { 1271 /* There is a hard-coded table for the default generator. */ 1272 const EC_POINT *generator = EC_GROUP_get0_generator(group); 1273 1274 if (generator != NULL && ecp_nistz256_is_affine_G(generator)) { 1275 /* There is a hard-coded table for the default generator. */ 1276 return 1; 1277 } 1278 1279 return HAVEPRECOMP(group, nistz256); 1280 } 1281 1282 #if defined(__x86_64) || defined(__x86_64__) || \ 1283 defined(_M_AMD64) || defined(_M_X64) || \ 1284 defined(__powerpc64__) || defined(_ARCH_PP64) || \ 1285 defined(__aarch64__) 1286 /* 1287 * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P) 1288 */ 1289 void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], 1290 const BN_ULONG a[P256_LIMBS], 1291 const BN_ULONG b[P256_LIMBS]); 1292 void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], 1293 const BN_ULONG a[P256_LIMBS], 1294 int rep); 1295 1296 static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r, 1297 const BIGNUM *x, BN_CTX *ctx) 1298 { 1299 /* RR = 2^512 mod ord(p256) */ 1300 static const BN_ULONG RR[P256_LIMBS] = { 1301 TOBN(0x83244c95,0xbe79eea2), TOBN(0x4699799c,0x49bd6fa6), 1302 TOBN(0x2845b239,0x2b6bec59), TOBN(0x66e12d94,0xf3d95620) 1303 }; 1304 /* The constant 1 (unlike ONE that is one in Montgomery representation) */ 1305 static const BN_ULONG one[P256_LIMBS] = { 1306 TOBN(0,1), TOBN(0,0), TOBN(0,0), TOBN(0,0) 1307 }; 1308 /* 1309 * We don't use entry 0 in the table, so we omit it and address 1310 * with -1 offset. 1311 */ 1312 BN_ULONG table[15][P256_LIMBS]; 1313 BN_ULONG out[P256_LIMBS], t[P256_LIMBS]; 1314 int i, ret = 0; 1315 enum { 1316 i_1 = 0, i_10, i_11, i_101, i_111, i_1010, i_1111, 1317 i_10101, i_101010, i_101111, i_x6, i_x8, i_x16, i_x32 1318 }; 1319 1320 /* 1321 * Catch allocation failure early. 1322 */ 1323 if (bn_wexpand(r, P256_LIMBS) == NULL) { 1324 ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB); 1325 goto err; 1326 } 1327 1328 if ((BN_num_bits(x) > 256) || BN_is_negative(x)) { 1329 BIGNUM *tmp; 1330 1331 if ((tmp = BN_CTX_get(ctx)) == NULL 1332 || !BN_nnmod(tmp, x, group->order, ctx)) { 1333 ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB); 1334 goto err; 1335 } 1336 x = tmp; 1337 } 1338 1339 if (!ecp_nistz256_bignum_to_field_elem(t, x)) { 1340 ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, EC_R_COORDINATES_OUT_OF_RANGE); 1341 goto err; 1342 } 1343 1344 ecp_nistz256_ord_mul_mont(table[0], t, RR); 1345 #if 0 1346 /* 1347 * Original sparse-then-fixed-window algorithm, retained for reference. 1348 */ 1349 for (i = 2; i < 16; i += 2) { 1350 ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1); 1351 ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]); 1352 } 1353 1354 /* 1355 * The top 128bit of the exponent are highly redudndant, so we 1356 * perform an optimized flow 1357 */ 1358 ecp_nistz256_ord_sqr_mont(t, table[15-1], 4); /* f0 */ 1359 ecp_nistz256_ord_mul_mont(t, t, table[15-1]); /* ff */ 1360 1361 ecp_nistz256_ord_sqr_mont(out, t, 8); /* ff00 */ 1362 ecp_nistz256_ord_mul_mont(out, out, t); /* ffff */ 1363 1364 ecp_nistz256_ord_sqr_mont(t, out, 16); /* ffff0000 */ 1365 ecp_nistz256_ord_mul_mont(t, t, out); /* ffffffff */ 1366 1367 ecp_nistz256_ord_sqr_mont(out, t, 64); /* ffffffff0000000000000000 */ 1368 ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffff */ 1369 1370 ecp_nistz256_ord_sqr_mont(out, out, 32); /* ffffffff00000000ffffffff00000000 */ 1371 ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffffffffffff */ 1372 1373 /* 1374 * The bottom 128 bit of the exponent are processed with fixed 4-bit window 1375 */ 1376 for(i = 0; i < 32; i++) { 1377 /* expLo - the low 128 bits of the exponent we use (ord(p256) - 2), 1378 * split into nibbles */ 1379 static const unsigned char expLo[32] = { 1380 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4, 1381 0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf 1382 }; 1383 1384 ecp_nistz256_ord_sqr_mont(out, out, 4); 1385 /* The exponent is public, no need in constant-time access */ 1386 ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]); 1387 } 1388 #else 1389 /* 1390 * https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion 1391 * 1392 * Even though this code path spares 12 squarings, 4.5%, and 13 1393 * multiplications, 25%, on grand scale sign operation is not that 1394 * much faster, not more that 2%... 1395 */ 1396 1397 /* pre-calculate powers */ 1398 ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1); 1399 1400 ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]); 1401 1402 ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]); 1403 1404 ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]); 1405 1406 ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1); 1407 1408 ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]); 1409 1410 ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1); 1411 ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]); 1412 1413 ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1); 1414 1415 ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]); 1416 1417 ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]); 1418 1419 ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2); 1420 ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]); 1421 1422 ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8); 1423 ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]); 1424 1425 ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16); 1426 ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]); 1427 1428 /* calculations */ 1429 ecp_nistz256_ord_sqr_mont(out, table[i_x32], 64); 1430 ecp_nistz256_ord_mul_mont(out, out, table[i_x32]); 1431 1432 for (i = 0; i < 27; i++) { 1433 static const struct { unsigned char p, i; } chain[27] = { 1434 { 32, i_x32 }, { 6, i_101111 }, { 5, i_111 }, 1435 { 4, i_11 }, { 5, i_1111 }, { 5, i_10101 }, 1436 { 4, i_101 }, { 3, i_101 }, { 3, i_101 }, 1437 { 5, i_111 }, { 9, i_101111 }, { 6, i_1111 }, 1438 { 2, i_1 }, { 5, i_1 }, { 6, i_1111 }, 1439 { 5, i_111 }, { 4, i_111 }, { 5, i_111 }, 1440 { 5, i_101 }, { 3, i_11 }, { 10, i_101111 }, 1441 { 2, i_11 }, { 5, i_11 }, { 5, i_11 }, 1442 { 3, i_1 }, { 7, i_10101 }, { 6, i_1111 } 1443 }; 1444 1445 ecp_nistz256_ord_sqr_mont(out, out, chain[i].p); 1446 ecp_nistz256_ord_mul_mont(out, out, table[chain[i].i]); 1447 } 1448 #endif 1449 ecp_nistz256_ord_mul_mont(out, out, one); 1450 1451 /* 1452 * Can't fail, but check return code to be consistent anyway. 1453 */ 1454 if (!bn_set_words(r, out, P256_LIMBS)) 1455 goto err; 1456 1457 ret = 1; 1458 err: 1459 return ret; 1460 } 1461 #else 1462 # define ecp_nistz256_inv_mod_ord NULL 1463 #endif 1464 1465 const EC_METHOD *EC_GFp_nistz256_method(void) 1466 { 1467 static const EC_METHOD ret = { 1468 EC_FLAGS_DEFAULT_OCT, 1469 NID_X9_62_prime_field, 1470 ec_GFp_mont_group_init, 1471 ec_GFp_mont_group_finish, 1472 ec_GFp_mont_group_clear_finish, 1473 ec_GFp_mont_group_copy, 1474 ec_GFp_mont_group_set_curve, 1475 ec_GFp_simple_group_get_curve, 1476 ec_GFp_simple_group_get_degree, 1477 ec_group_simple_order_bits, 1478 ec_GFp_simple_group_check_discriminant, 1479 ec_GFp_simple_point_init, 1480 ec_GFp_simple_point_finish, 1481 ec_GFp_simple_point_clear_finish, 1482 ec_GFp_simple_point_copy, 1483 ec_GFp_simple_point_set_to_infinity, 1484 ec_GFp_simple_set_Jprojective_coordinates_GFp, 1485 ec_GFp_simple_get_Jprojective_coordinates_GFp, 1486 ec_GFp_simple_point_set_affine_coordinates, 1487 ecp_nistz256_get_affine, 1488 0, 0, 0, 1489 ec_GFp_simple_add, 1490 ec_GFp_simple_dbl, 1491 ec_GFp_simple_invert, 1492 ec_GFp_simple_is_at_infinity, 1493 ec_GFp_simple_is_on_curve, 1494 ec_GFp_simple_cmp, 1495 ec_GFp_simple_make_affine, 1496 ec_GFp_simple_points_make_affine, 1497 ecp_nistz256_points_mul, /* mul */ 1498 ecp_nistz256_mult_precompute, /* precompute_mult */ 1499 ecp_nistz256_window_have_precompute_mult, /* have_precompute_mult */ 1500 ec_GFp_mont_field_mul, 1501 ec_GFp_mont_field_sqr, 1502 0, /* field_div */ 1503 ec_GFp_mont_field_inv, 1504 ec_GFp_mont_field_encode, 1505 ec_GFp_mont_field_decode, 1506 ec_GFp_mont_field_set_to_one, 1507 ec_key_simple_priv2oct, 1508 ec_key_simple_oct2priv, 1509 0, /* set private */ 1510 ec_key_simple_generate_key, 1511 ec_key_simple_check_key, 1512 ec_key_simple_generate_public_key, 1513 0, /* keycopy */ 1514 0, /* keyfinish */ 1515 ecdh_simple_compute_key, 1516 ecp_nistz256_inv_mod_ord, /* can be #define-d NULL */ 1517 0, /* blind_coordinates */ 1518 0, /* ladder_pre */ 1519 0, /* ladder_step */ 1520 0 /* ladder_post */ 1521 }; 1522 1523 return &ret; 1524 } 1525