1 /* 2 * Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. 3 * Copyright (c) 2014, Intel Corporation. All Rights Reserved. 4 * Copyright (c) 2015, CloudFlare, Inc. 5 * 6 * Licensed under the OpenSSL license (the "License"). You may not use 7 * this file except in compliance with the License. You can obtain a copy 8 * in the file LICENSE in the source distribution or at 9 * https://www.openssl.org/source/license.html 10 * 11 * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) 12 * (1) Intel Corporation, Israel Development Center, Haifa, Israel 13 * (2) University of Haifa, Israel 14 * (3) CloudFlare, Inc. 15 * 16 * Reference: 17 * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with 18 * 256 Bit Primes" 19 */ 20 21 #include <string.h> 22 23 #include "internal/cryptlib.h" 24 #include "crypto/bn.h" 25 #include "ec_local.h" 26 #include "internal/refcount.h" 27 28 #if BN_BITS2 != 64 29 # define TOBN(hi,lo) lo,hi 30 #else 31 # define TOBN(hi,lo) ((BN_ULONG)hi<<32|lo) 32 #endif 33 34 #if defined(__GNUC__) 35 # define ALIGN32 __attribute((aligned(32))) 36 #elif defined(_MSC_VER) 37 # define ALIGN32 __declspec(align(32)) 38 #else 39 # define ALIGN32 40 #endif 41 42 #define ALIGNPTR(p,N) ((unsigned char *)p+N-(size_t)p%N) 43 #define P256_LIMBS (256/BN_BITS2) 44 45 typedef unsigned short u16; 46 47 typedef struct { 48 BN_ULONG X[P256_LIMBS]; 49 BN_ULONG Y[P256_LIMBS]; 50 BN_ULONG Z[P256_LIMBS]; 51 } P256_POINT; 52 53 typedef struct { 54 BN_ULONG X[P256_LIMBS]; 55 BN_ULONG Y[P256_LIMBS]; 56 } P256_POINT_AFFINE; 57 58 typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; 59 60 /* structure for precomputed multiples of the generator */ 61 struct nistz256_pre_comp_st { 62 const EC_GROUP *group; /* Parent EC_GROUP object */ 63 size_t w; /* Window size */ 64 /* 65 * Constant time access to the X and Y coordinates of the pre-computed, 66 * generator multiplies, in the Montgomery domain. Pre-calculated 67 * multiplies are stored in affine form. 68 */ 69 PRECOMP256_ROW *precomp; 70 void *precomp_storage; 71 CRYPTO_REF_COUNT references; 72 CRYPTO_RWLOCK *lock; 73 }; 74 75 /* Functions implemented in assembly */ 76 /* 77 * Most of below mentioned functions *preserve* the property of inputs 78 * being fully reduced, i.e. being in [0, modulus) range. Simply put if 79 * inputs are fully reduced, then output is too. Note that reverse is 80 * not true, in sense that given partially reduced inputs output can be 81 * either, not unlikely reduced. And "most" in first sentence refers to 82 * the fact that given the calculations flow one can tolerate that 83 * addition, 1st function below, produces partially reduced result *if* 84 * multiplications by 2 and 3, which customarily use addition, fully 85 * reduce it. This effectively gives two options: a) addition produces 86 * fully reduced result [as long as inputs are, just like remaining 87 * functions]; b) addition is allowed to produce partially reduced 88 * result, but multiplications by 2 and 3 perform additional reduction 89 * step. Choice between the two can be platform-specific, but it was a) 90 * in all cases so far... 91 */ 92 /* Modular add: res = a+b mod P */ 93 void ecp_nistz256_add(BN_ULONG res[P256_LIMBS], 94 const BN_ULONG a[P256_LIMBS], 95 const BN_ULONG b[P256_LIMBS]); 96 /* Modular mul by 2: res = 2*a mod P */ 97 void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS], 98 const BN_ULONG a[P256_LIMBS]); 99 /* Modular mul by 3: res = 3*a mod P */ 100 void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS], 101 const BN_ULONG a[P256_LIMBS]); 102 103 /* Modular div by 2: res = a/2 mod P */ 104 void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS], 105 const BN_ULONG a[P256_LIMBS]); 106 /* Modular sub: res = a-b mod P */ 107 void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS], 108 const BN_ULONG a[P256_LIMBS], 109 const BN_ULONG b[P256_LIMBS]); 110 /* Modular neg: res = -a mod P */ 111 void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); 112 /* Montgomery mul: res = a*b*2^-256 mod P */ 113 void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], 114 const BN_ULONG a[P256_LIMBS], 115 const BN_ULONG b[P256_LIMBS]); 116 /* Montgomery sqr: res = a*a*2^-256 mod P */ 117 void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], 118 const BN_ULONG a[P256_LIMBS]); 119 /* Convert a number from Montgomery domain, by multiplying with 1 */ 120 void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], 121 const BN_ULONG in[P256_LIMBS]); 122 /* Convert a number to Montgomery domain, by multiplying with 2^512 mod P*/ 123 void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS], 124 const BN_ULONG in[P256_LIMBS]); 125 /* Functions that perform constant time access to the precomputed tables */ 126 void ecp_nistz256_scatter_w5(P256_POINT *val, 127 const P256_POINT *in_t, int idx); 128 void ecp_nistz256_gather_w5(P256_POINT *val, 129 const P256_POINT *in_t, int idx); 130 void ecp_nistz256_scatter_w7(P256_POINT_AFFINE *val, 131 const P256_POINT_AFFINE *in_t, int idx); 132 void ecp_nistz256_gather_w7(P256_POINT_AFFINE *val, 133 const P256_POINT_AFFINE *in_t, int idx); 134 135 /* One converted into the Montgomery domain */ 136 static const BN_ULONG ONE[P256_LIMBS] = { 137 TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), 138 TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe) 139 }; 140 141 static NISTZ256_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group); 142 143 /* Precomputed tables for the default generator */ 144 extern const PRECOMP256_ROW ecp_nistz256_precomputed[37]; 145 146 /* Recode window to a signed digit, see ecp_nistputil.c for details */ 147 static unsigned int _booth_recode_w5(unsigned int in) 148 { 149 unsigned int s, d; 150 151 s = ~((in >> 5) - 1); 152 d = (1 << 6) - in - 1; 153 d = (d & s) | (in & ~s); 154 d = (d >> 1) + (d & 1); 155 156 return (d << 1) + (s & 1); 157 } 158 159 static unsigned int _booth_recode_w7(unsigned int in) 160 { 161 unsigned int s, d; 162 163 s = ~((in >> 7) - 1); 164 d = (1 << 8) - in - 1; 165 d = (d & s) | (in & ~s); 166 d = (d >> 1) + (d & 1); 167 168 return (d << 1) + (s & 1); 169 } 170 171 static void copy_conditional(BN_ULONG dst[P256_LIMBS], 172 const BN_ULONG src[P256_LIMBS], BN_ULONG move) 173 { 174 BN_ULONG mask1 = 0-move; 175 BN_ULONG mask2 = ~mask1; 176 177 dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); 178 dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); 179 dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); 180 dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); 181 if (P256_LIMBS == 8) { 182 dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); 183 dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); 184 dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); 185 dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); 186 } 187 } 188 189 static BN_ULONG is_zero(BN_ULONG in) 190 { 191 in |= (0 - in); 192 in = ~in; 193 in >>= BN_BITS2 - 1; 194 return in; 195 } 196 197 static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS], 198 const BN_ULONG b[P256_LIMBS]) 199 { 200 BN_ULONG res; 201 202 res = a[0] ^ b[0]; 203 res |= a[1] ^ b[1]; 204 res |= a[2] ^ b[2]; 205 res |= a[3] ^ b[3]; 206 if (P256_LIMBS == 8) { 207 res |= a[4] ^ b[4]; 208 res |= a[5] ^ b[5]; 209 res |= a[6] ^ b[6]; 210 res |= a[7] ^ b[7]; 211 } 212 213 return is_zero(res); 214 } 215 216 static BN_ULONG is_one(const BIGNUM *z) 217 { 218 BN_ULONG res = 0; 219 BN_ULONG *a = bn_get_words(z); 220 221 if (bn_get_top(z) == (P256_LIMBS - P256_LIMBS / 8)) { 222 res = a[0] ^ ONE[0]; 223 res |= a[1] ^ ONE[1]; 224 res |= a[2] ^ ONE[2]; 225 res |= a[3] ^ ONE[3]; 226 if (P256_LIMBS == 8) { 227 res |= a[4] ^ ONE[4]; 228 res |= a[5] ^ ONE[5]; 229 res |= a[6] ^ ONE[6]; 230 /* 231 * no check for a[7] (being zero) on 32-bit platforms, 232 * because value of "one" takes only 7 limbs. 233 */ 234 } 235 res = is_zero(res); 236 } 237 238 return res; 239 } 240 241 /* 242 * For reference, this macro is used only when new ecp_nistz256 assembly 243 * module is being developed. For example, configure with 244 * -DECP_NISTZ256_REFERENCE_IMPLEMENTATION and implement only functions 245 * performing simplest arithmetic operations on 256-bit vectors. Then 246 * work on implementation of higher-level functions performing point 247 * operations. Then remove ECP_NISTZ256_REFERENCE_IMPLEMENTATION 248 * and never define it again. (The correct macro denoting presence of 249 * ecp_nistz256 module is ECP_NISTZ256_ASM.) 250 */ 251 #ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION 252 void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); 253 void ecp_nistz256_point_add(P256_POINT *r, 254 const P256_POINT *a, const P256_POINT *b); 255 void ecp_nistz256_point_add_affine(P256_POINT *r, 256 const P256_POINT *a, 257 const P256_POINT_AFFINE *b); 258 #else 259 /* Point double: r = 2*a */ 260 static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) 261 { 262 BN_ULONG S[P256_LIMBS]; 263 BN_ULONG M[P256_LIMBS]; 264 BN_ULONG Zsqr[P256_LIMBS]; 265 BN_ULONG tmp0[P256_LIMBS]; 266 267 const BN_ULONG *in_x = a->X; 268 const BN_ULONG *in_y = a->Y; 269 const BN_ULONG *in_z = a->Z; 270 271 BN_ULONG *res_x = r->X; 272 BN_ULONG *res_y = r->Y; 273 BN_ULONG *res_z = r->Z; 274 275 ecp_nistz256_mul_by_2(S, in_y); 276 277 ecp_nistz256_sqr_mont(Zsqr, in_z); 278 279 ecp_nistz256_sqr_mont(S, S); 280 281 ecp_nistz256_mul_mont(res_z, in_z, in_y); 282 ecp_nistz256_mul_by_2(res_z, res_z); 283 284 ecp_nistz256_add(M, in_x, Zsqr); 285 ecp_nistz256_sub(Zsqr, in_x, Zsqr); 286 287 ecp_nistz256_sqr_mont(res_y, S); 288 ecp_nistz256_div_by_2(res_y, res_y); 289 290 ecp_nistz256_mul_mont(M, M, Zsqr); 291 ecp_nistz256_mul_by_3(M, M); 292 293 ecp_nistz256_mul_mont(S, S, in_x); 294 ecp_nistz256_mul_by_2(tmp0, S); 295 296 ecp_nistz256_sqr_mont(res_x, M); 297 298 ecp_nistz256_sub(res_x, res_x, tmp0); 299 ecp_nistz256_sub(S, S, res_x); 300 301 ecp_nistz256_mul_mont(S, S, M); 302 ecp_nistz256_sub(res_y, S, res_y); 303 } 304 305 /* Point addition: r = a+b */ 306 static void ecp_nistz256_point_add(P256_POINT *r, 307 const P256_POINT *a, const P256_POINT *b) 308 { 309 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 310 BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS]; 311 BN_ULONG Z1sqr[P256_LIMBS]; 312 BN_ULONG Z2sqr[P256_LIMBS]; 313 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 314 BN_ULONG Hsqr[P256_LIMBS]; 315 BN_ULONG Rsqr[P256_LIMBS]; 316 BN_ULONG Hcub[P256_LIMBS]; 317 318 BN_ULONG res_x[P256_LIMBS]; 319 BN_ULONG res_y[P256_LIMBS]; 320 BN_ULONG res_z[P256_LIMBS]; 321 322 BN_ULONG in1infty, in2infty; 323 324 const BN_ULONG *in1_x = a->X; 325 const BN_ULONG *in1_y = a->Y; 326 const BN_ULONG *in1_z = a->Z; 327 328 const BN_ULONG *in2_x = b->X; 329 const BN_ULONG *in2_y = b->Y; 330 const BN_ULONG *in2_z = b->Z; 331 332 /* 333 * Infinity in encoded as (,,0) 334 */ 335 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 336 if (P256_LIMBS == 8) 337 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 338 339 in2infty = (in2_z[0] | in2_z[1] | in2_z[2] | in2_z[3]); 340 if (P256_LIMBS == 8) 341 in2infty |= (in2_z[4] | in2_z[5] | in2_z[6] | in2_z[7]); 342 343 in1infty = is_zero(in1infty); 344 in2infty = is_zero(in2infty); 345 346 ecp_nistz256_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ 347 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 348 349 ecp_nistz256_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ 350 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 351 352 ecp_nistz256_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ 353 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 354 ecp_nistz256_sub(R, S2, S1); /* R = S2 - S1 */ 355 356 ecp_nistz256_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ 357 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 358 ecp_nistz256_sub(H, U2, U1); /* H = U2 - U1 */ 359 360 /* 361 * The formulae are incorrect if the points are equal so we check for 362 * this and do doubling if this happens. 363 * 364 * Points here are in Jacobian projective coordinates (Xi, Yi, Zi) 365 * that are bound to the affine coordinates (xi, yi) by the following 366 * equations: 367 * - xi = Xi / (Zi)^2 368 * - y1 = Yi / (Zi)^3 369 * 370 * For the sake of optimization, the algorithm operates over 371 * intermediate variables U1, U2 and S1, S2 that are derived from 372 * the projective coordinates: 373 * - U1 = X1 * (Z2)^2 ; U2 = X2 * (Z1)^2 374 * - S1 = Y1 * (Z2)^3 ; S2 = Y2 * (Z1)^3 375 * 376 * It is easy to prove that is_equal(U1, U2) implies that the affine 377 * x-coordinates are equal, or either point is at infinity. 378 * Likewise is_equal(S1, S2) implies that the affine y-coordinates are 379 * equal, or either point is at infinity. 380 * 381 * The special case of either point being the point at infinity (Z1 or Z2 382 * is zero), is handled separately later on in this function, so we avoid 383 * jumping to point_double here in those special cases. 384 * 385 * When both points are inverse of each other, we know that the affine 386 * x-coordinates are equal, and the y-coordinates have different sign. 387 * Therefore since U1 = U2, we know H = 0, and therefore Z3 = H*Z1*Z2 388 * will equal 0, thus the result is infinity, if we simply let this 389 * function continue normally. 390 * 391 * We use bitwise operations to avoid potential side-channels introduced by 392 * the short-circuiting behaviour of boolean operators. 393 */ 394 if (is_equal(U1, U2) & ~in1infty & ~in2infty & is_equal(S1, S2)) { 395 /* 396 * This is obviously not constant-time but it should never happen during 397 * single point multiplication, so there is no timing leak for ECDH or 398 * ECDSA signing. 399 */ 400 ecp_nistz256_point_double(r, a); 401 return; 402 } 403 404 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 405 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 406 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 407 ecp_nistz256_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ 408 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 409 410 ecp_nistz256_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ 411 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 412 413 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 414 ecp_nistz256_sub(res_x, res_x, Hcub); 415 416 ecp_nistz256_sub(res_y, U2, res_x); 417 418 ecp_nistz256_mul_mont(S2, S1, Hcub); 419 ecp_nistz256_mul_mont(res_y, R, res_y); 420 ecp_nistz256_sub(res_y, res_y, S2); 421 422 copy_conditional(res_x, in2_x, in1infty); 423 copy_conditional(res_y, in2_y, in1infty); 424 copy_conditional(res_z, in2_z, in1infty); 425 426 copy_conditional(res_x, in1_x, in2infty); 427 copy_conditional(res_y, in1_y, in2infty); 428 copy_conditional(res_z, in1_z, in2infty); 429 430 memcpy(r->X, res_x, sizeof(res_x)); 431 memcpy(r->Y, res_y, sizeof(res_y)); 432 memcpy(r->Z, res_z, sizeof(res_z)); 433 } 434 435 /* Point addition when b is known to be affine: r = a+b */ 436 static void ecp_nistz256_point_add_affine(P256_POINT *r, 437 const P256_POINT *a, 438 const P256_POINT_AFFINE *b) 439 { 440 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 441 BN_ULONG Z1sqr[P256_LIMBS]; 442 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 443 BN_ULONG Hsqr[P256_LIMBS]; 444 BN_ULONG Rsqr[P256_LIMBS]; 445 BN_ULONG Hcub[P256_LIMBS]; 446 447 BN_ULONG res_x[P256_LIMBS]; 448 BN_ULONG res_y[P256_LIMBS]; 449 BN_ULONG res_z[P256_LIMBS]; 450 451 BN_ULONG in1infty, in2infty; 452 453 const BN_ULONG *in1_x = a->X; 454 const BN_ULONG *in1_y = a->Y; 455 const BN_ULONG *in1_z = a->Z; 456 457 const BN_ULONG *in2_x = b->X; 458 const BN_ULONG *in2_y = b->Y; 459 460 /* 461 * Infinity in encoded as (,,0) 462 */ 463 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 464 if (P256_LIMBS == 8) 465 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 466 467 /* 468 * In affine representation we encode infinity as (0,0), which is 469 * not on the curve, so it is OK 470 */ 471 in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] | 472 in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]); 473 if (P256_LIMBS == 8) 474 in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] | 475 in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]); 476 477 in1infty = is_zero(in1infty); 478 in2infty = is_zero(in2infty); 479 480 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 481 482 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 483 ecp_nistz256_sub(H, U2, in1_x); /* H = U2 - U1 */ 484 485 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 486 487 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 488 489 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 490 ecp_nistz256_sub(R, S2, in1_y); /* R = S2 - S1 */ 491 492 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 493 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 494 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 495 496 ecp_nistz256_mul_mont(U2, in1_x, Hsqr); /* U1*H^2 */ 497 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 498 499 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 500 ecp_nistz256_sub(res_x, res_x, Hcub); 501 ecp_nistz256_sub(H, U2, res_x); 502 503 ecp_nistz256_mul_mont(S2, in1_y, Hcub); 504 ecp_nistz256_mul_mont(H, H, R); 505 ecp_nistz256_sub(res_y, H, S2); 506 507 copy_conditional(res_x, in2_x, in1infty); 508 copy_conditional(res_x, in1_x, in2infty); 509 510 copy_conditional(res_y, in2_y, in1infty); 511 copy_conditional(res_y, in1_y, in2infty); 512 513 copy_conditional(res_z, ONE, in1infty); 514 copy_conditional(res_z, in1_z, in2infty); 515 516 memcpy(r->X, res_x, sizeof(res_x)); 517 memcpy(r->Y, res_y, sizeof(res_y)); 518 memcpy(r->Z, res_z, sizeof(res_z)); 519 } 520 #endif 521 522 /* r = in^-1 mod p */ 523 static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS], 524 const BN_ULONG in[P256_LIMBS]) 525 { 526 /* 527 * The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff 528 * ffffffff ffffffff We use FLT and used poly-2 as exponent 529 */ 530 BN_ULONG p2[P256_LIMBS]; 531 BN_ULONG p4[P256_LIMBS]; 532 BN_ULONG p8[P256_LIMBS]; 533 BN_ULONG p16[P256_LIMBS]; 534 BN_ULONG p32[P256_LIMBS]; 535 BN_ULONG res[P256_LIMBS]; 536 int i; 537 538 ecp_nistz256_sqr_mont(res, in); 539 ecp_nistz256_mul_mont(p2, res, in); /* 3*p */ 540 541 ecp_nistz256_sqr_mont(res, p2); 542 ecp_nistz256_sqr_mont(res, res); 543 ecp_nistz256_mul_mont(p4, res, p2); /* f*p */ 544 545 ecp_nistz256_sqr_mont(res, p4); 546 ecp_nistz256_sqr_mont(res, res); 547 ecp_nistz256_sqr_mont(res, res); 548 ecp_nistz256_sqr_mont(res, res); 549 ecp_nistz256_mul_mont(p8, res, p4); /* ff*p */ 550 551 ecp_nistz256_sqr_mont(res, p8); 552 for (i = 0; i < 7; i++) 553 ecp_nistz256_sqr_mont(res, res); 554 ecp_nistz256_mul_mont(p16, res, p8); /* ffff*p */ 555 556 ecp_nistz256_sqr_mont(res, p16); 557 for (i = 0; i < 15; i++) 558 ecp_nistz256_sqr_mont(res, res); 559 ecp_nistz256_mul_mont(p32, res, p16); /* ffffffff*p */ 560 561 ecp_nistz256_sqr_mont(res, p32); 562 for (i = 0; i < 31; i++) 563 ecp_nistz256_sqr_mont(res, res); 564 ecp_nistz256_mul_mont(res, res, in); 565 566 for (i = 0; i < 32 * 4; i++) 567 ecp_nistz256_sqr_mont(res, res); 568 ecp_nistz256_mul_mont(res, res, p32); 569 570 for (i = 0; i < 32; i++) 571 ecp_nistz256_sqr_mont(res, res); 572 ecp_nistz256_mul_mont(res, res, p32); 573 574 for (i = 0; i < 16; i++) 575 ecp_nistz256_sqr_mont(res, res); 576 ecp_nistz256_mul_mont(res, res, p16); 577 578 for (i = 0; i < 8; i++) 579 ecp_nistz256_sqr_mont(res, res); 580 ecp_nistz256_mul_mont(res, res, p8); 581 582 ecp_nistz256_sqr_mont(res, res); 583 ecp_nistz256_sqr_mont(res, res); 584 ecp_nistz256_sqr_mont(res, res); 585 ecp_nistz256_sqr_mont(res, res); 586 ecp_nistz256_mul_mont(res, res, p4); 587 588 ecp_nistz256_sqr_mont(res, res); 589 ecp_nistz256_sqr_mont(res, res); 590 ecp_nistz256_mul_mont(res, res, p2); 591 592 ecp_nistz256_sqr_mont(res, res); 593 ecp_nistz256_sqr_mont(res, res); 594 ecp_nistz256_mul_mont(res, res, in); 595 596 memcpy(r, res, sizeof(res)); 597 } 598 599 /* 600 * ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and 601 * returns one if it fits. Otherwise it returns zero. 602 */ 603 __owur static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS], 604 const BIGNUM *in) 605 { 606 return bn_copy_words(out, in, P256_LIMBS); 607 } 608 609 /* r = sum(scalar[i]*point[i]) */ 610 __owur static int ecp_nistz256_windowed_mul(const EC_GROUP *group, 611 P256_POINT *r, 612 const BIGNUM **scalar, 613 const EC_POINT **point, 614 size_t num, BN_CTX *ctx) 615 { 616 size_t i; 617 int j, ret = 0; 618 unsigned int idx; 619 unsigned char (*p_str)[33] = NULL; 620 const unsigned int window_size = 5; 621 const unsigned int mask = (1 << (window_size + 1)) - 1; 622 unsigned int wvalue; 623 P256_POINT *temp; /* place for 5 temporary points */ 624 const BIGNUM **scalars = NULL; 625 P256_POINT (*table)[16] = NULL; 626 void *table_storage = NULL; 627 628 if ((num * 16 + 6) > OPENSSL_MALLOC_MAX_NELEMS(P256_POINT) 629 || (table_storage = 630 OPENSSL_malloc((num * 16 + 5) * sizeof(P256_POINT) + 64)) == NULL 631 || (p_str = 632 OPENSSL_malloc(num * 33 * sizeof(unsigned char))) == NULL 633 || (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) { 634 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_MALLOC_FAILURE); 635 goto err; 636 } 637 638 table = (void *)ALIGNPTR(table_storage, 64); 639 temp = (P256_POINT *)(table + num); 640 641 for (i = 0; i < num; i++) { 642 P256_POINT *row = table[i]; 643 644 /* This is an unusual input, we don't guarantee constant-timeness. */ 645 if ((BN_num_bits(scalar[i]) > 256) || BN_is_negative(scalar[i])) { 646 BIGNUM *mod; 647 648 if ((mod = BN_CTX_get(ctx)) == NULL) 649 goto err; 650 if (!BN_nnmod(mod, scalar[i], group->order, ctx)) { 651 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_BN_LIB); 652 goto err; 653 } 654 scalars[i] = mod; 655 } else 656 scalars[i] = scalar[i]; 657 658 for (j = 0; j < bn_get_top(scalars[i]) * BN_BYTES; j += BN_BYTES) { 659 BN_ULONG d = bn_get_words(scalars[i])[j / BN_BYTES]; 660 661 p_str[i][j + 0] = (unsigned char)d; 662 p_str[i][j + 1] = (unsigned char)(d >> 8); 663 p_str[i][j + 2] = (unsigned char)(d >> 16); 664 p_str[i][j + 3] = (unsigned char)(d >>= 24); 665 if (BN_BYTES == 8) { 666 d >>= 8; 667 p_str[i][j + 4] = (unsigned char)d; 668 p_str[i][j + 5] = (unsigned char)(d >> 8); 669 p_str[i][j + 6] = (unsigned char)(d >> 16); 670 p_str[i][j + 7] = (unsigned char)(d >> 24); 671 } 672 } 673 for (; j < 33; j++) 674 p_str[i][j] = 0; 675 676 if (!ecp_nistz256_bignum_to_field_elem(temp[0].X, point[i]->X) 677 || !ecp_nistz256_bignum_to_field_elem(temp[0].Y, point[i]->Y) 678 || !ecp_nistz256_bignum_to_field_elem(temp[0].Z, point[i]->Z)) { 679 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, 680 EC_R_COORDINATES_OUT_OF_RANGE); 681 goto err; 682 } 683 684 /* 685 * row[0] is implicitly (0,0,0) (the point at infinity), therefore it 686 * is not stored. All other values are actually stored with an offset 687 * of -1 in table. 688 */ 689 690 ecp_nistz256_scatter_w5 (row, &temp[0], 1); 691 ecp_nistz256_point_double(&temp[1], &temp[0]); /*1+1=2 */ 692 ecp_nistz256_scatter_w5 (row, &temp[1], 2); 693 ecp_nistz256_point_add (&temp[2], &temp[1], &temp[0]); /*2+1=3 */ 694 ecp_nistz256_scatter_w5 (row, &temp[2], 3); 695 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*2=4 */ 696 ecp_nistz256_scatter_w5 (row, &temp[1], 4); 697 ecp_nistz256_point_double(&temp[2], &temp[2]); /*2*3=6 */ 698 ecp_nistz256_scatter_w5 (row, &temp[2], 6); 699 ecp_nistz256_point_add (&temp[3], &temp[1], &temp[0]); /*4+1=5 */ 700 ecp_nistz256_scatter_w5 (row, &temp[3], 5); 701 ecp_nistz256_point_add (&temp[4], &temp[2], &temp[0]); /*6+1=7 */ 702 ecp_nistz256_scatter_w5 (row, &temp[4], 7); 703 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*4=8 */ 704 ecp_nistz256_scatter_w5 (row, &temp[1], 8); 705 ecp_nistz256_point_double(&temp[2], &temp[2]); /*2*6=12 */ 706 ecp_nistz256_scatter_w5 (row, &temp[2], 12); 707 ecp_nistz256_point_double(&temp[3], &temp[3]); /*2*5=10 */ 708 ecp_nistz256_scatter_w5 (row, &temp[3], 10); 709 ecp_nistz256_point_double(&temp[4], &temp[4]); /*2*7=14 */ 710 ecp_nistz256_scatter_w5 (row, &temp[4], 14); 711 ecp_nistz256_point_add (&temp[2], &temp[2], &temp[0]); /*12+1=13*/ 712 ecp_nistz256_scatter_w5 (row, &temp[2], 13); 713 ecp_nistz256_point_add (&temp[3], &temp[3], &temp[0]); /*10+1=11*/ 714 ecp_nistz256_scatter_w5 (row, &temp[3], 11); 715 ecp_nistz256_point_add (&temp[4], &temp[4], &temp[0]); /*14+1=15*/ 716 ecp_nistz256_scatter_w5 (row, &temp[4], 15); 717 ecp_nistz256_point_add (&temp[2], &temp[1], &temp[0]); /*8+1=9 */ 718 ecp_nistz256_scatter_w5 (row, &temp[2], 9); 719 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*8=16 */ 720 ecp_nistz256_scatter_w5 (row, &temp[1], 16); 721 } 722 723 idx = 255; 724 725 wvalue = p_str[0][(idx - 1) / 8]; 726 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 727 728 /* 729 * We gather to temp[0], because we know it's position relative 730 * to table 731 */ 732 ecp_nistz256_gather_w5(&temp[0], table[0], _booth_recode_w5(wvalue) >> 1); 733 memcpy(r, &temp[0], sizeof(temp[0])); 734 735 while (idx >= 5) { 736 for (i = (idx == 255 ? 1 : 0); i < num; i++) { 737 unsigned int off = (idx - 1) / 8; 738 739 wvalue = p_str[i][off] | p_str[i][off + 1] << 8; 740 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 741 742 wvalue = _booth_recode_w5(wvalue); 743 744 ecp_nistz256_gather_w5(&temp[0], table[i], wvalue >> 1); 745 746 ecp_nistz256_neg(temp[1].Y, temp[0].Y); 747 copy_conditional(temp[0].Y, temp[1].Y, (wvalue & 1)); 748 749 ecp_nistz256_point_add(r, r, &temp[0]); 750 } 751 752 idx -= window_size; 753 754 ecp_nistz256_point_double(r, r); 755 ecp_nistz256_point_double(r, r); 756 ecp_nistz256_point_double(r, r); 757 ecp_nistz256_point_double(r, r); 758 ecp_nistz256_point_double(r, r); 759 } 760 761 /* Final window */ 762 for (i = 0; i < num; i++) { 763 wvalue = p_str[i][0]; 764 wvalue = (wvalue << 1) & mask; 765 766 wvalue = _booth_recode_w5(wvalue); 767 768 ecp_nistz256_gather_w5(&temp[0], table[i], wvalue >> 1); 769 770 ecp_nistz256_neg(temp[1].Y, temp[0].Y); 771 copy_conditional(temp[0].Y, temp[1].Y, wvalue & 1); 772 773 ecp_nistz256_point_add(r, r, &temp[0]); 774 } 775 776 ret = 1; 777 err: 778 OPENSSL_free(table_storage); 779 OPENSSL_free(p_str); 780 OPENSSL_free(scalars); 781 return ret; 782 } 783 784 /* Coordinates of G, for which we have precomputed tables */ 785 static const BN_ULONG def_xG[P256_LIMBS] = { 786 TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), 787 TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6) 788 }; 789 790 static const BN_ULONG def_yG[P256_LIMBS] = { 791 TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), 792 TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85) 793 }; 794 795 /* 796 * ecp_nistz256_is_affine_G returns one if |generator| is the standard, P-256 797 * generator. 798 */ 799 static int ecp_nistz256_is_affine_G(const EC_POINT *generator) 800 { 801 return (bn_get_top(generator->X) == P256_LIMBS) && 802 (bn_get_top(generator->Y) == P256_LIMBS) && 803 is_equal(bn_get_words(generator->X), def_xG) && 804 is_equal(bn_get_words(generator->Y), def_yG) && 805 is_one(generator->Z); 806 } 807 808 __owur static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx) 809 { 810 /* 811 * We precompute a table for a Booth encoded exponent (wNAF) based 812 * computation. Each table holds 64 values for safe access, with an 813 * implicit value of infinity at index zero. We use window of size 7, and 814 * therefore require ceil(256/7) = 37 tables. 815 */ 816 const BIGNUM *order; 817 EC_POINT *P = NULL, *T = NULL; 818 const EC_POINT *generator; 819 NISTZ256_PRE_COMP *pre_comp; 820 BN_CTX *new_ctx = NULL; 821 int i, j, k, ret = 0; 822 size_t w; 823 824 PRECOMP256_ROW *preComputedTable = NULL; 825 unsigned char *precomp_storage = NULL; 826 827 /* if there is an old NISTZ256_PRE_COMP object, throw it away */ 828 EC_pre_comp_free(group); 829 generator = EC_GROUP_get0_generator(group); 830 if (generator == NULL) { 831 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNDEFINED_GENERATOR); 832 return 0; 833 } 834 835 if (ecp_nistz256_is_affine_G(generator)) { 836 /* 837 * No need to calculate tables for the standard generator because we 838 * have them statically. 839 */ 840 return 1; 841 } 842 843 if ((pre_comp = ecp_nistz256_pre_comp_new(group)) == NULL) 844 return 0; 845 846 if (ctx == NULL) { 847 ctx = new_ctx = BN_CTX_new(); 848 if (ctx == NULL) 849 goto err; 850 } 851 852 BN_CTX_start(ctx); 853 854 order = EC_GROUP_get0_order(group); 855 if (order == NULL) 856 goto err; 857 858 if (BN_is_zero(order)) { 859 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNKNOWN_ORDER); 860 goto err; 861 } 862 863 w = 7; 864 865 if ((precomp_storage = 866 OPENSSL_malloc(37 * 64 * sizeof(P256_POINT_AFFINE) + 64)) == NULL) { 867 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, ERR_R_MALLOC_FAILURE); 868 goto err; 869 } 870 871 preComputedTable = (void *)ALIGNPTR(precomp_storage, 64); 872 873 P = EC_POINT_new(group); 874 T = EC_POINT_new(group); 875 if (P == NULL || T == NULL) 876 goto err; 877 878 /* 879 * The zero entry is implicitly infinity, and we skip it, storing other 880 * values with -1 offset. 881 */ 882 if (!EC_POINT_copy(T, generator)) 883 goto err; 884 885 for (k = 0; k < 64; k++) { 886 if (!EC_POINT_copy(P, T)) 887 goto err; 888 for (j = 0; j < 37; j++) { 889 P256_POINT_AFFINE temp; 890 /* 891 * It would be faster to use EC_POINTs_make_affine and 892 * make multiple points affine at the same time. 893 */ 894 if (!EC_POINT_make_affine(group, P, ctx)) 895 goto err; 896 if (!ecp_nistz256_bignum_to_field_elem(temp.X, P->X) || 897 !ecp_nistz256_bignum_to_field_elem(temp.Y, P->Y)) { 898 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 899 EC_R_COORDINATES_OUT_OF_RANGE); 900 goto err; 901 } 902 ecp_nistz256_scatter_w7(preComputedTable[j], &temp, k); 903 for (i = 0; i < 7; i++) { 904 if (!EC_POINT_dbl(group, P, P, ctx)) 905 goto err; 906 } 907 } 908 if (!EC_POINT_add(group, T, T, generator, ctx)) 909 goto err; 910 } 911 912 pre_comp->group = group; 913 pre_comp->w = w; 914 pre_comp->precomp = preComputedTable; 915 pre_comp->precomp_storage = precomp_storage; 916 precomp_storage = NULL; 917 SETPRECOMP(group, nistz256, pre_comp); 918 pre_comp = NULL; 919 ret = 1; 920 921 err: 922 BN_CTX_end(ctx); 923 BN_CTX_free(new_ctx); 924 925 EC_nistz256_pre_comp_free(pre_comp); 926 OPENSSL_free(precomp_storage); 927 EC_POINT_free(P); 928 EC_POINT_free(T); 929 return ret; 930 } 931 932 /* 933 * Note that by default ECP_NISTZ256_AVX2 is undefined. While it's great 934 * code processing 4 points in parallel, corresponding serial operation 935 * is several times slower, because it uses 29x29=58-bit multiplication 936 * as opposite to 64x64=128-bit in integer-only scalar case. As result 937 * it doesn't provide *significant* performance improvement. Note that 938 * just defining ECP_NISTZ256_AVX2 is not sufficient to make it work, 939 * you'd need to compile even asm/ecp_nistz256-avx.pl module. 940 */ 941 #if defined(ECP_NISTZ256_AVX2) 942 # if !(defined(__x86_64) || defined(__x86_64__) || \ 943 defined(_M_AMD64) || defined(_M_X64)) || \ 944 !(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */ 945 # undef ECP_NISTZ256_AVX2 946 # else 947 /* Constant time access, loading four values, from four consecutive tables */ 948 void ecp_nistz256_avx2_multi_gather_w7(void *result, const void *in, 949 int index0, int index1, int index2, 950 int index3); 951 void ecp_nistz256_avx2_transpose_convert(void *RESULTx4, const void *in); 952 void ecp_nistz256_avx2_convert_transpose_back(void *result, const void *Ax4); 953 void ecp_nistz256_avx2_point_add_affine_x4(void *RESULTx4, const void *Ax4, 954 const void *Bx4); 955 void ecp_nistz256_avx2_point_add_affines_x4(void *RESULTx4, const void *Ax4, 956 const void *Bx4); 957 void ecp_nistz256_avx2_to_mont(void *RESULTx4, const void *Ax4); 958 void ecp_nistz256_avx2_from_mont(void *RESULTx4, const void *Ax4); 959 void ecp_nistz256_avx2_set1(void *RESULTx4); 960 int ecp_nistz_avx2_eligible(void); 961 962 static void booth_recode_w7(unsigned char *sign, 963 unsigned char *digit, unsigned char in) 964 { 965 unsigned char s, d; 966 967 s = ~((in >> 7) - 1); 968 d = (1 << 8) - in - 1; 969 d = (d & s) | (in & ~s); 970 d = (d >> 1) + (d & 1); 971 972 *sign = s & 1; 973 *digit = d; 974 } 975 976 /* 977 * ecp_nistz256_avx2_mul_g performs multiplication by G, using only the 978 * precomputed table. It does 4 affine point additions in parallel, 979 * significantly speeding up point multiplication for a fixed value. 980 */ 981 static void ecp_nistz256_avx2_mul_g(P256_POINT *r, 982 unsigned char p_str[33], 983 const P256_POINT_AFFINE(*preComputedTable)[64]) 984 { 985 const unsigned int window_size = 7; 986 const unsigned int mask = (1 << (window_size + 1)) - 1; 987 unsigned int wvalue; 988 /* Using 4 windows at a time */ 989 unsigned char sign0, digit0; 990 unsigned char sign1, digit1; 991 unsigned char sign2, digit2; 992 unsigned char sign3, digit3; 993 unsigned int idx = 0; 994 BN_ULONG tmp[P256_LIMBS]; 995 int i; 996 997 ALIGN32 BN_ULONG aX4[4 * 9 * 3] = { 0 }; 998 ALIGN32 BN_ULONG bX4[4 * 9 * 2] = { 0 }; 999 ALIGN32 P256_POINT_AFFINE point_arr[4]; 1000 ALIGN32 P256_POINT res_point_arr[4]; 1001 1002 /* Initial four windows */ 1003 wvalue = *((u16 *) & p_str[0]); 1004 wvalue = (wvalue << 1) & mask; 1005 idx += window_size; 1006 booth_recode_w7(&sign0, &digit0, wvalue); 1007 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1008 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1009 idx += window_size; 1010 booth_recode_w7(&sign1, &digit1, wvalue); 1011 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1012 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1013 idx += window_size; 1014 booth_recode_w7(&sign2, &digit2, wvalue); 1015 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1016 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1017 idx += window_size; 1018 booth_recode_w7(&sign3, &digit3, wvalue); 1019 1020 ecp_nistz256_avx2_multi_gather_w7(point_arr, preComputedTable[0], 1021 digit0, digit1, digit2, digit3); 1022 1023 ecp_nistz256_neg(tmp, point_arr[0].Y); 1024 copy_conditional(point_arr[0].Y, tmp, sign0); 1025 ecp_nistz256_neg(tmp, point_arr[1].Y); 1026 copy_conditional(point_arr[1].Y, tmp, sign1); 1027 ecp_nistz256_neg(tmp, point_arr[2].Y); 1028 copy_conditional(point_arr[2].Y, tmp, sign2); 1029 ecp_nistz256_neg(tmp, point_arr[3].Y); 1030 copy_conditional(point_arr[3].Y, tmp, sign3); 1031 1032 ecp_nistz256_avx2_transpose_convert(aX4, point_arr); 1033 ecp_nistz256_avx2_to_mont(aX4, aX4); 1034 ecp_nistz256_avx2_to_mont(&aX4[4 * 9], &aX4[4 * 9]); 1035 ecp_nistz256_avx2_set1(&aX4[4 * 9 * 2]); 1036 1037 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1038 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1039 idx += window_size; 1040 booth_recode_w7(&sign0, &digit0, wvalue); 1041 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1042 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1043 idx += window_size; 1044 booth_recode_w7(&sign1, &digit1, wvalue); 1045 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1046 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1047 idx += window_size; 1048 booth_recode_w7(&sign2, &digit2, wvalue); 1049 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1050 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1051 idx += window_size; 1052 booth_recode_w7(&sign3, &digit3, wvalue); 1053 1054 ecp_nistz256_avx2_multi_gather_w7(point_arr, preComputedTable[4 * 1], 1055 digit0, digit1, digit2, digit3); 1056 1057 ecp_nistz256_neg(tmp, point_arr[0].Y); 1058 copy_conditional(point_arr[0].Y, tmp, sign0); 1059 ecp_nistz256_neg(tmp, point_arr[1].Y); 1060 copy_conditional(point_arr[1].Y, tmp, sign1); 1061 ecp_nistz256_neg(tmp, point_arr[2].Y); 1062 copy_conditional(point_arr[2].Y, tmp, sign2); 1063 ecp_nistz256_neg(tmp, point_arr[3].Y); 1064 copy_conditional(point_arr[3].Y, tmp, sign3); 1065 1066 ecp_nistz256_avx2_transpose_convert(bX4, point_arr); 1067 ecp_nistz256_avx2_to_mont(bX4, bX4); 1068 ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]); 1069 /* Optimized when both inputs are affine */ 1070 ecp_nistz256_avx2_point_add_affines_x4(aX4, aX4, bX4); 1071 1072 for (i = 2; i < 9; i++) { 1073 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1074 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1075 idx += window_size; 1076 booth_recode_w7(&sign0, &digit0, wvalue); 1077 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1078 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1079 idx += window_size; 1080 booth_recode_w7(&sign1, &digit1, wvalue); 1081 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1082 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1083 idx += window_size; 1084 booth_recode_w7(&sign2, &digit2, wvalue); 1085 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1086 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1087 idx += window_size; 1088 booth_recode_w7(&sign3, &digit3, wvalue); 1089 1090 ecp_nistz256_avx2_multi_gather_w7(point_arr, 1091 preComputedTable[4 * i], 1092 digit0, digit1, digit2, digit3); 1093 1094 ecp_nistz256_neg(tmp, point_arr[0].Y); 1095 copy_conditional(point_arr[0].Y, tmp, sign0); 1096 ecp_nistz256_neg(tmp, point_arr[1].Y); 1097 copy_conditional(point_arr[1].Y, tmp, sign1); 1098 ecp_nistz256_neg(tmp, point_arr[2].Y); 1099 copy_conditional(point_arr[2].Y, tmp, sign2); 1100 ecp_nistz256_neg(tmp, point_arr[3].Y); 1101 copy_conditional(point_arr[3].Y, tmp, sign3); 1102 1103 ecp_nistz256_avx2_transpose_convert(bX4, point_arr); 1104 ecp_nistz256_avx2_to_mont(bX4, bX4); 1105 ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]); 1106 1107 ecp_nistz256_avx2_point_add_affine_x4(aX4, aX4, bX4); 1108 } 1109 1110 ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 0], &aX4[4 * 9 * 0]); 1111 ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 1], &aX4[4 * 9 * 1]); 1112 ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 2], &aX4[4 * 9 * 2]); 1113 1114 ecp_nistz256_avx2_convert_transpose_back(res_point_arr, aX4); 1115 /* Last window is performed serially */ 1116 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1117 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1118 booth_recode_w7(&sign0, &digit0, wvalue); 1119 ecp_nistz256_gather_w7((P256_POINT_AFFINE *)r, 1120 preComputedTable[36], digit0); 1121 ecp_nistz256_neg(tmp, r->Y); 1122 copy_conditional(r->Y, tmp, sign0); 1123 memcpy(r->Z, ONE, sizeof(ONE)); 1124 /* Sum the four windows */ 1125 ecp_nistz256_point_add(r, r, &res_point_arr[0]); 1126 ecp_nistz256_point_add(r, r, &res_point_arr[1]); 1127 ecp_nistz256_point_add(r, r, &res_point_arr[2]); 1128 ecp_nistz256_point_add(r, r, &res_point_arr[3]); 1129 } 1130 # endif 1131 #endif 1132 1133 __owur static int ecp_nistz256_set_from_affine(EC_POINT *out, const EC_GROUP *group, 1134 const P256_POINT_AFFINE *in, 1135 BN_CTX *ctx) 1136 { 1137 int ret = 0; 1138 1139 if ((ret = bn_set_words(out->X, in->X, P256_LIMBS)) 1140 && (ret = bn_set_words(out->Y, in->Y, P256_LIMBS)) 1141 && (ret = bn_set_words(out->Z, ONE, P256_LIMBS))) 1142 out->Z_is_one = 1; 1143 1144 return ret; 1145 } 1146 1147 /* r = scalar*G + sum(scalars[i]*points[i]) */ 1148 __owur static int ecp_nistz256_points_mul(const EC_GROUP *group, 1149 EC_POINT *r, 1150 const BIGNUM *scalar, 1151 size_t num, 1152 const EC_POINT *points[], 1153 const BIGNUM *scalars[], BN_CTX *ctx) 1154 { 1155 int i = 0, ret = 0, no_precomp_for_generator = 0, p_is_infinity = 0; 1156 unsigned char p_str[33] = { 0 }; 1157 const PRECOMP256_ROW *preComputedTable = NULL; 1158 const NISTZ256_PRE_COMP *pre_comp = NULL; 1159 const EC_POINT *generator = NULL; 1160 const BIGNUM **new_scalars = NULL; 1161 const EC_POINT **new_points = NULL; 1162 unsigned int idx = 0; 1163 const unsigned int window_size = 7; 1164 const unsigned int mask = (1 << (window_size + 1)) - 1; 1165 unsigned int wvalue; 1166 ALIGN32 union { 1167 P256_POINT p; 1168 P256_POINT_AFFINE a; 1169 } t, p; 1170 BIGNUM *tmp_scalar; 1171 1172 if ((num + 1) == 0 || (num + 1) > OPENSSL_MALLOC_MAX_NELEMS(void *)) { 1173 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1174 return 0; 1175 } 1176 1177 BN_CTX_start(ctx); 1178 1179 if (scalar) { 1180 generator = EC_GROUP_get0_generator(group); 1181 if (generator == NULL) { 1182 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_UNDEFINED_GENERATOR); 1183 goto err; 1184 } 1185 1186 /* look if we can use precomputed multiples of generator */ 1187 pre_comp = group->pre_comp.nistz256; 1188 1189 if (pre_comp) { 1190 /* 1191 * If there is a precomputed table for the generator, check that 1192 * it was generated with the same generator. 1193 */ 1194 EC_POINT *pre_comp_generator = EC_POINT_new(group); 1195 if (pre_comp_generator == NULL) 1196 goto err; 1197 1198 ecp_nistz256_gather_w7(&p.a, pre_comp->precomp[0], 1); 1199 if (!ecp_nistz256_set_from_affine(pre_comp_generator, 1200 group, &p.a, ctx)) { 1201 EC_POINT_free(pre_comp_generator); 1202 goto err; 1203 } 1204 1205 if (0 == EC_POINT_cmp(group, generator, pre_comp_generator, ctx)) 1206 preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp; 1207 1208 EC_POINT_free(pre_comp_generator); 1209 } 1210 1211 if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) { 1212 /* 1213 * If there is no precomputed data, but the generator is the 1214 * default, a hardcoded table of precomputed data is used. This 1215 * is because applications, such as Apache, do not use 1216 * EC_KEY_precompute_mult. 1217 */ 1218 preComputedTable = ecp_nistz256_precomputed; 1219 } 1220 1221 if (preComputedTable) { 1222 if ((BN_num_bits(scalar) > 256) 1223 || BN_is_negative(scalar)) { 1224 if ((tmp_scalar = BN_CTX_get(ctx)) == NULL) 1225 goto err; 1226 1227 if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) { 1228 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_BN_LIB); 1229 goto err; 1230 } 1231 scalar = tmp_scalar; 1232 } 1233 1234 for (i = 0; i < bn_get_top(scalar) * BN_BYTES; i += BN_BYTES) { 1235 BN_ULONG d = bn_get_words(scalar)[i / BN_BYTES]; 1236 1237 p_str[i + 0] = (unsigned char)d; 1238 p_str[i + 1] = (unsigned char)(d >> 8); 1239 p_str[i + 2] = (unsigned char)(d >> 16); 1240 p_str[i + 3] = (unsigned char)(d >>= 24); 1241 if (BN_BYTES == 8) { 1242 d >>= 8; 1243 p_str[i + 4] = (unsigned char)d; 1244 p_str[i + 5] = (unsigned char)(d >> 8); 1245 p_str[i + 6] = (unsigned char)(d >> 16); 1246 p_str[i + 7] = (unsigned char)(d >> 24); 1247 } 1248 } 1249 1250 for (; i < 33; i++) 1251 p_str[i] = 0; 1252 1253 #if defined(ECP_NISTZ256_AVX2) 1254 if (ecp_nistz_avx2_eligible()) { 1255 ecp_nistz256_avx2_mul_g(&p.p, p_str, preComputedTable); 1256 } else 1257 #endif 1258 { 1259 BN_ULONG infty; 1260 1261 /* First window */ 1262 wvalue = (p_str[0] << 1) & mask; 1263 idx += window_size; 1264 1265 wvalue = _booth_recode_w7(wvalue); 1266 1267 ecp_nistz256_gather_w7(&p.a, preComputedTable[0], 1268 wvalue >> 1); 1269 1270 ecp_nistz256_neg(p.p.Z, p.p.Y); 1271 copy_conditional(p.p.Y, p.p.Z, wvalue & 1); 1272 1273 /* 1274 * Since affine infinity is encoded as (0,0) and 1275 * Jacobian ias (,,0), we need to harmonize them 1276 * by assigning "one" or zero to Z. 1277 */ 1278 infty = (p.p.X[0] | p.p.X[1] | p.p.X[2] | p.p.X[3] | 1279 p.p.Y[0] | p.p.Y[1] | p.p.Y[2] | p.p.Y[3]); 1280 if (P256_LIMBS == 8) 1281 infty |= (p.p.X[4] | p.p.X[5] | p.p.X[6] | p.p.X[7] | 1282 p.p.Y[4] | p.p.Y[5] | p.p.Y[6] | p.p.Y[7]); 1283 1284 infty = 0 - is_zero(infty); 1285 infty = ~infty; 1286 1287 p.p.Z[0] = ONE[0] & infty; 1288 p.p.Z[1] = ONE[1] & infty; 1289 p.p.Z[2] = ONE[2] & infty; 1290 p.p.Z[3] = ONE[3] & infty; 1291 if (P256_LIMBS == 8) { 1292 p.p.Z[4] = ONE[4] & infty; 1293 p.p.Z[5] = ONE[5] & infty; 1294 p.p.Z[6] = ONE[6] & infty; 1295 p.p.Z[7] = ONE[7] & infty; 1296 } 1297 1298 for (i = 1; i < 37; i++) { 1299 unsigned int off = (idx - 1) / 8; 1300 wvalue = p_str[off] | p_str[off + 1] << 8; 1301 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1302 idx += window_size; 1303 1304 wvalue = _booth_recode_w7(wvalue); 1305 1306 ecp_nistz256_gather_w7(&t.a, 1307 preComputedTable[i], wvalue >> 1); 1308 1309 ecp_nistz256_neg(t.p.Z, t.a.Y); 1310 copy_conditional(t.a.Y, t.p.Z, wvalue & 1); 1311 1312 ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a); 1313 } 1314 } 1315 } else { 1316 p_is_infinity = 1; 1317 no_precomp_for_generator = 1; 1318 } 1319 } else 1320 p_is_infinity = 1; 1321 1322 if (no_precomp_for_generator) { 1323 /* 1324 * Without a precomputed table for the generator, it has to be 1325 * handled like a normal point. 1326 */ 1327 new_scalars = OPENSSL_malloc((num + 1) * sizeof(BIGNUM *)); 1328 if (new_scalars == NULL) { 1329 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1330 goto err; 1331 } 1332 1333 new_points = OPENSSL_malloc((num + 1) * sizeof(EC_POINT *)); 1334 if (new_points == NULL) { 1335 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1336 goto err; 1337 } 1338 1339 memcpy(new_scalars, scalars, num * sizeof(BIGNUM *)); 1340 new_scalars[num] = scalar; 1341 memcpy(new_points, points, num * sizeof(EC_POINT *)); 1342 new_points[num] = generator; 1343 1344 scalars = new_scalars; 1345 points = new_points; 1346 num++; 1347 } 1348 1349 if (num) { 1350 P256_POINT *out = &t.p; 1351 if (p_is_infinity) 1352 out = &p.p; 1353 1354 if (!ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx)) 1355 goto err; 1356 1357 if (!p_is_infinity) 1358 ecp_nistz256_point_add(&p.p, &p.p, out); 1359 } 1360 1361 /* Not constant-time, but we're only operating on the public output. */ 1362 if (!bn_set_words(r->X, p.p.X, P256_LIMBS) || 1363 !bn_set_words(r->Y, p.p.Y, P256_LIMBS) || 1364 !bn_set_words(r->Z, p.p.Z, P256_LIMBS)) { 1365 goto err; 1366 } 1367 r->Z_is_one = is_one(r->Z) & 1; 1368 1369 ret = 1; 1370 1371 err: 1372 BN_CTX_end(ctx); 1373 OPENSSL_free(new_points); 1374 OPENSSL_free(new_scalars); 1375 return ret; 1376 } 1377 1378 __owur static int ecp_nistz256_get_affine(const EC_GROUP *group, 1379 const EC_POINT *point, 1380 BIGNUM *x, BIGNUM *y, BN_CTX *ctx) 1381 { 1382 BN_ULONG z_inv2[P256_LIMBS]; 1383 BN_ULONG z_inv3[P256_LIMBS]; 1384 BN_ULONG x_aff[P256_LIMBS]; 1385 BN_ULONG y_aff[P256_LIMBS]; 1386 BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS]; 1387 BN_ULONG x_ret[P256_LIMBS], y_ret[P256_LIMBS]; 1388 1389 if (EC_POINT_is_at_infinity(group, point)) { 1390 ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_POINT_AT_INFINITY); 1391 return 0; 1392 } 1393 1394 if (!ecp_nistz256_bignum_to_field_elem(point_x, point->X) || 1395 !ecp_nistz256_bignum_to_field_elem(point_y, point->Y) || 1396 !ecp_nistz256_bignum_to_field_elem(point_z, point->Z)) { 1397 ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_COORDINATES_OUT_OF_RANGE); 1398 return 0; 1399 } 1400 1401 ecp_nistz256_mod_inverse(z_inv3, point_z); 1402 ecp_nistz256_sqr_mont(z_inv2, z_inv3); 1403 ecp_nistz256_mul_mont(x_aff, z_inv2, point_x); 1404 1405 if (x != NULL) { 1406 ecp_nistz256_from_mont(x_ret, x_aff); 1407 if (!bn_set_words(x, x_ret, P256_LIMBS)) 1408 return 0; 1409 } 1410 1411 if (y != NULL) { 1412 ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2); 1413 ecp_nistz256_mul_mont(y_aff, z_inv3, point_y); 1414 ecp_nistz256_from_mont(y_ret, y_aff); 1415 if (!bn_set_words(y, y_ret, P256_LIMBS)) 1416 return 0; 1417 } 1418 1419 return 1; 1420 } 1421 1422 static NISTZ256_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group) 1423 { 1424 NISTZ256_PRE_COMP *ret = NULL; 1425 1426 if (!group) 1427 return NULL; 1428 1429 ret = OPENSSL_zalloc(sizeof(*ret)); 1430 1431 if (ret == NULL) { 1432 ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); 1433 return ret; 1434 } 1435 1436 ret->group = group; 1437 ret->w = 6; /* default */ 1438 ret->references = 1; 1439 1440 ret->lock = CRYPTO_THREAD_lock_new(); 1441 if (ret->lock == NULL) { 1442 ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); 1443 OPENSSL_free(ret); 1444 return NULL; 1445 } 1446 return ret; 1447 } 1448 1449 NISTZ256_PRE_COMP *EC_nistz256_pre_comp_dup(NISTZ256_PRE_COMP *p) 1450 { 1451 int i; 1452 if (p != NULL) 1453 CRYPTO_UP_REF(&p->references, &i, p->lock); 1454 return p; 1455 } 1456 1457 void EC_nistz256_pre_comp_free(NISTZ256_PRE_COMP *pre) 1458 { 1459 int i; 1460 1461 if (pre == NULL) 1462 return; 1463 1464 CRYPTO_DOWN_REF(&pre->references, &i, pre->lock); 1465 REF_PRINT_COUNT("EC_nistz256", x); 1466 if (i > 0) 1467 return; 1468 REF_ASSERT_ISNT(i < 0); 1469 1470 OPENSSL_free(pre->precomp_storage); 1471 CRYPTO_THREAD_lock_free(pre->lock); 1472 OPENSSL_free(pre); 1473 } 1474 1475 1476 static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group) 1477 { 1478 /* There is a hard-coded table for the default generator. */ 1479 const EC_POINT *generator = EC_GROUP_get0_generator(group); 1480 1481 if (generator != NULL && ecp_nistz256_is_affine_G(generator)) { 1482 /* There is a hard-coded table for the default generator. */ 1483 return 1; 1484 } 1485 1486 return HAVEPRECOMP(group, nistz256); 1487 } 1488 1489 #if defined(__x86_64) || defined(__x86_64__) || \ 1490 defined(_M_AMD64) || defined(_M_X64) || \ 1491 defined(__powerpc64__) || defined(_ARCH_PP64) || \ 1492 defined(__aarch64__) 1493 /* 1494 * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P) 1495 */ 1496 void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], 1497 const BN_ULONG a[P256_LIMBS], 1498 const BN_ULONG b[P256_LIMBS]); 1499 void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], 1500 const BN_ULONG a[P256_LIMBS], 1501 int rep); 1502 1503 static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r, 1504 const BIGNUM *x, BN_CTX *ctx) 1505 { 1506 /* RR = 2^512 mod ord(p256) */ 1507 static const BN_ULONG RR[P256_LIMBS] = { 1508 TOBN(0x83244c95,0xbe79eea2), TOBN(0x4699799c,0x49bd6fa6), 1509 TOBN(0x2845b239,0x2b6bec59), TOBN(0x66e12d94,0xf3d95620) 1510 }; 1511 /* The constant 1 (unlike ONE that is one in Montgomery representation) */ 1512 static const BN_ULONG one[P256_LIMBS] = { 1513 TOBN(0,1), TOBN(0,0), TOBN(0,0), TOBN(0,0) 1514 }; 1515 /* 1516 * We don't use entry 0 in the table, so we omit it and address 1517 * with -1 offset. 1518 */ 1519 BN_ULONG table[15][P256_LIMBS]; 1520 BN_ULONG out[P256_LIMBS], t[P256_LIMBS]; 1521 int i, ret = 0; 1522 enum { 1523 i_1 = 0, i_10, i_11, i_101, i_111, i_1010, i_1111, 1524 i_10101, i_101010, i_101111, i_x6, i_x8, i_x16, i_x32 1525 }; 1526 1527 /* 1528 * Catch allocation failure early. 1529 */ 1530 if (bn_wexpand(r, P256_LIMBS) == NULL) { 1531 ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB); 1532 goto err; 1533 } 1534 1535 if ((BN_num_bits(x) > 256) || BN_is_negative(x)) { 1536 BIGNUM *tmp; 1537 1538 if ((tmp = BN_CTX_get(ctx)) == NULL 1539 || !BN_nnmod(tmp, x, group->order, ctx)) { 1540 ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB); 1541 goto err; 1542 } 1543 x = tmp; 1544 } 1545 1546 if (!ecp_nistz256_bignum_to_field_elem(t, x)) { 1547 ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, EC_R_COORDINATES_OUT_OF_RANGE); 1548 goto err; 1549 } 1550 1551 ecp_nistz256_ord_mul_mont(table[0], t, RR); 1552 #if 0 1553 /* 1554 * Original sparse-then-fixed-window algorithm, retained for reference. 1555 */ 1556 for (i = 2; i < 16; i += 2) { 1557 ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1); 1558 ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]); 1559 } 1560 1561 /* 1562 * The top 128bit of the exponent are highly redudndant, so we 1563 * perform an optimized flow 1564 */ 1565 ecp_nistz256_ord_sqr_mont(t, table[15-1], 4); /* f0 */ 1566 ecp_nistz256_ord_mul_mont(t, t, table[15-1]); /* ff */ 1567 1568 ecp_nistz256_ord_sqr_mont(out, t, 8); /* ff00 */ 1569 ecp_nistz256_ord_mul_mont(out, out, t); /* ffff */ 1570 1571 ecp_nistz256_ord_sqr_mont(t, out, 16); /* ffff0000 */ 1572 ecp_nistz256_ord_mul_mont(t, t, out); /* ffffffff */ 1573 1574 ecp_nistz256_ord_sqr_mont(out, t, 64); /* ffffffff0000000000000000 */ 1575 ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffff */ 1576 1577 ecp_nistz256_ord_sqr_mont(out, out, 32); /* ffffffff00000000ffffffff00000000 */ 1578 ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffffffffffff */ 1579 1580 /* 1581 * The bottom 128 bit of the exponent are processed with fixed 4-bit window 1582 */ 1583 for(i = 0; i < 32; i++) { 1584 /* expLo - the low 128 bits of the exponent we use (ord(p256) - 2), 1585 * split into nibbles */ 1586 static const unsigned char expLo[32] = { 1587 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4, 1588 0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf 1589 }; 1590 1591 ecp_nistz256_ord_sqr_mont(out, out, 4); 1592 /* The exponent is public, no need in constant-time access */ 1593 ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]); 1594 } 1595 #else 1596 /* 1597 * https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion 1598 * 1599 * Even though this code path spares 12 squarings, 4.5%, and 13 1600 * multiplications, 25%, on grand scale sign operation is not that 1601 * much faster, not more that 2%... 1602 */ 1603 1604 /* pre-calculate powers */ 1605 ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1); 1606 1607 ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]); 1608 1609 ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]); 1610 1611 ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]); 1612 1613 ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1); 1614 1615 ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]); 1616 1617 ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1); 1618 ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]); 1619 1620 ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1); 1621 1622 ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]); 1623 1624 ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]); 1625 1626 ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2); 1627 ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]); 1628 1629 ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8); 1630 ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]); 1631 1632 ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16); 1633 ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]); 1634 1635 /* calculations */ 1636 ecp_nistz256_ord_sqr_mont(out, table[i_x32], 64); 1637 ecp_nistz256_ord_mul_mont(out, out, table[i_x32]); 1638 1639 for (i = 0; i < 27; i++) { 1640 static const struct { unsigned char p, i; } chain[27] = { 1641 { 32, i_x32 }, { 6, i_101111 }, { 5, i_111 }, 1642 { 4, i_11 }, { 5, i_1111 }, { 5, i_10101 }, 1643 { 4, i_101 }, { 3, i_101 }, { 3, i_101 }, 1644 { 5, i_111 }, { 9, i_101111 }, { 6, i_1111 }, 1645 { 2, i_1 }, { 5, i_1 }, { 6, i_1111 }, 1646 { 5, i_111 }, { 4, i_111 }, { 5, i_111 }, 1647 { 5, i_101 }, { 3, i_11 }, { 10, i_101111 }, 1648 { 2, i_11 }, { 5, i_11 }, { 5, i_11 }, 1649 { 3, i_1 }, { 7, i_10101 }, { 6, i_1111 } 1650 }; 1651 1652 ecp_nistz256_ord_sqr_mont(out, out, chain[i].p); 1653 ecp_nistz256_ord_mul_mont(out, out, table[chain[i].i]); 1654 } 1655 #endif 1656 ecp_nistz256_ord_mul_mont(out, out, one); 1657 1658 /* 1659 * Can't fail, but check return code to be consistent anyway. 1660 */ 1661 if (!bn_set_words(r, out, P256_LIMBS)) 1662 goto err; 1663 1664 ret = 1; 1665 err: 1666 return ret; 1667 } 1668 #else 1669 # define ecp_nistz256_inv_mod_ord NULL 1670 #endif 1671 1672 const EC_METHOD *EC_GFp_nistz256_method(void) 1673 { 1674 static const EC_METHOD ret = { 1675 EC_FLAGS_DEFAULT_OCT, 1676 NID_X9_62_prime_field, 1677 ec_GFp_mont_group_init, 1678 ec_GFp_mont_group_finish, 1679 ec_GFp_mont_group_clear_finish, 1680 ec_GFp_mont_group_copy, 1681 ec_GFp_mont_group_set_curve, 1682 ec_GFp_simple_group_get_curve, 1683 ec_GFp_simple_group_get_degree, 1684 ec_group_simple_order_bits, 1685 ec_GFp_simple_group_check_discriminant, 1686 ec_GFp_simple_point_init, 1687 ec_GFp_simple_point_finish, 1688 ec_GFp_simple_point_clear_finish, 1689 ec_GFp_simple_point_copy, 1690 ec_GFp_simple_point_set_to_infinity, 1691 ec_GFp_simple_set_Jprojective_coordinates_GFp, 1692 ec_GFp_simple_get_Jprojective_coordinates_GFp, 1693 ec_GFp_simple_point_set_affine_coordinates, 1694 ecp_nistz256_get_affine, 1695 0, 0, 0, 1696 ec_GFp_simple_add, 1697 ec_GFp_simple_dbl, 1698 ec_GFp_simple_invert, 1699 ec_GFp_simple_is_at_infinity, 1700 ec_GFp_simple_is_on_curve, 1701 ec_GFp_simple_cmp, 1702 ec_GFp_simple_make_affine, 1703 ec_GFp_simple_points_make_affine, 1704 ecp_nistz256_points_mul, /* mul */ 1705 ecp_nistz256_mult_precompute, /* precompute_mult */ 1706 ecp_nistz256_window_have_precompute_mult, /* have_precompute_mult */ 1707 ec_GFp_mont_field_mul, 1708 ec_GFp_mont_field_sqr, 1709 0, /* field_div */ 1710 ec_GFp_mont_field_inv, 1711 ec_GFp_mont_field_encode, 1712 ec_GFp_mont_field_decode, 1713 ec_GFp_mont_field_set_to_one, 1714 ec_key_simple_priv2oct, 1715 ec_key_simple_oct2priv, 1716 0, /* set private */ 1717 ec_key_simple_generate_key, 1718 ec_key_simple_check_key, 1719 ec_key_simple_generate_public_key, 1720 0, /* keycopy */ 1721 0, /* keyfinish */ 1722 ecdh_simple_compute_key, 1723 ecp_nistz256_inv_mod_ord, /* can be #define-d NULL */ 1724 0, /* blind_coordinates */ 1725 0, /* ladder_pre */ 1726 0, /* ladder_step */ 1727 0 /* ladder_post */ 1728 }; 1729 1730 return &ret; 1731 } 1732