1 /* 2 * Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved. 3 * Copyright (c) 2014, Intel Corporation. All Rights Reserved. 4 * Copyright (c) 2015, CloudFlare, Inc. 5 * 6 * Licensed under the OpenSSL license (the "License"). You may not use 7 * this file except in compliance with the License. You can obtain a copy 8 * in the file LICENSE in the source distribution or at 9 * https://www.openssl.org/source/license.html 10 * 11 * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) 12 * (1) Intel Corporation, Israel Development Center, Haifa, Israel 13 * (2) University of Haifa, Israel 14 * (3) CloudFlare, Inc. 15 * 16 * Reference: 17 * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with 18 * 256 Bit Primes" 19 */ 20 21 #include <string.h> 22 23 #include "internal/cryptlib.h" 24 #include "internal/bn_int.h" 25 #include "ec_lcl.h" 26 #include "internal/refcount.h" 27 28 #if BN_BITS2 != 64 29 # define TOBN(hi,lo) lo,hi 30 #else 31 # define TOBN(hi,lo) ((BN_ULONG)hi<<32|lo) 32 #endif 33 34 #if defined(__GNUC__) 35 # define ALIGN32 __attribute((aligned(32))) 36 #elif defined(_MSC_VER) 37 # define ALIGN32 __declspec(align(32)) 38 #else 39 # define ALIGN32 40 #endif 41 42 #define ALIGNPTR(p,N) ((unsigned char *)p+N-(size_t)p%N) 43 #define P256_LIMBS (256/BN_BITS2) 44 45 typedef unsigned short u16; 46 47 typedef struct { 48 BN_ULONG X[P256_LIMBS]; 49 BN_ULONG Y[P256_LIMBS]; 50 BN_ULONG Z[P256_LIMBS]; 51 } P256_POINT; 52 53 typedef struct { 54 BN_ULONG X[P256_LIMBS]; 55 BN_ULONG Y[P256_LIMBS]; 56 } P256_POINT_AFFINE; 57 58 typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; 59 60 /* structure for precomputed multiples of the generator */ 61 struct nistz256_pre_comp_st { 62 const EC_GROUP *group; /* Parent EC_GROUP object */ 63 size_t w; /* Window size */ 64 /* 65 * Constant time access to the X and Y coordinates of the pre-computed, 66 * generator multiplies, in the Montgomery domain. Pre-calculated 67 * multiplies are stored in affine form. 68 */ 69 PRECOMP256_ROW *precomp; 70 void *precomp_storage; 71 CRYPTO_REF_COUNT references; 72 CRYPTO_RWLOCK *lock; 73 }; 74 75 /* Functions implemented in assembly */ 76 /* 77 * Most of below mentioned functions *preserve* the property of inputs 78 * being fully reduced, i.e. being in [0, modulus) range. Simply put if 79 * inputs are fully reduced, then output is too. Note that reverse is 80 * not true, in sense that given partially reduced inputs output can be 81 * either, not unlikely reduced. And "most" in first sentence refers to 82 * the fact that given the calculations flow one can tolerate that 83 * addition, 1st function below, produces partially reduced result *if* 84 * multiplications by 2 and 3, which customarily use addition, fully 85 * reduce it. This effectively gives two options: a) addition produces 86 * fully reduced result [as long as inputs are, just like remaining 87 * functions]; b) addition is allowed to produce partially reduced 88 * result, but multiplications by 2 and 3 perform additional reduction 89 * step. Choice between the two can be platform-specific, but it was a) 90 * in all cases so far... 91 */ 92 /* Modular add: res = a+b mod P */ 93 void ecp_nistz256_add(BN_ULONG res[P256_LIMBS], 94 const BN_ULONG a[P256_LIMBS], 95 const BN_ULONG b[P256_LIMBS]); 96 /* Modular mul by 2: res = 2*a mod P */ 97 void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS], 98 const BN_ULONG a[P256_LIMBS]); 99 /* Modular mul by 3: res = 3*a mod P */ 100 void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS], 101 const BN_ULONG a[P256_LIMBS]); 102 103 /* Modular div by 2: res = a/2 mod P */ 104 void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS], 105 const BN_ULONG a[P256_LIMBS]); 106 /* Modular sub: res = a-b mod P */ 107 void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS], 108 const BN_ULONG a[P256_LIMBS], 109 const BN_ULONG b[P256_LIMBS]); 110 /* Modular neg: res = -a mod P */ 111 void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); 112 /* Montgomery mul: res = a*b*2^-256 mod P */ 113 void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], 114 const BN_ULONG a[P256_LIMBS], 115 const BN_ULONG b[P256_LIMBS]); 116 /* Montgomery sqr: res = a*a*2^-256 mod P */ 117 void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], 118 const BN_ULONG a[P256_LIMBS]); 119 /* Convert a number from Montgomery domain, by multiplying with 1 */ 120 void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], 121 const BN_ULONG in[P256_LIMBS]); 122 /* Convert a number to Montgomery domain, by multiplying with 2^512 mod P*/ 123 void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS], 124 const BN_ULONG in[P256_LIMBS]); 125 /* Functions that perform constant time access to the precomputed tables */ 126 void ecp_nistz256_scatter_w5(P256_POINT *val, 127 const P256_POINT *in_t, int idx); 128 void ecp_nistz256_gather_w5(P256_POINT *val, 129 const P256_POINT *in_t, int idx); 130 void ecp_nistz256_scatter_w7(P256_POINT_AFFINE *val, 131 const P256_POINT_AFFINE *in_t, int idx); 132 void ecp_nistz256_gather_w7(P256_POINT_AFFINE *val, 133 const P256_POINT_AFFINE *in_t, int idx); 134 135 /* One converted into the Montgomery domain */ 136 static const BN_ULONG ONE[P256_LIMBS] = { 137 TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), 138 TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe) 139 }; 140 141 static NISTZ256_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group); 142 143 /* Precomputed tables for the default generator */ 144 extern const PRECOMP256_ROW ecp_nistz256_precomputed[37]; 145 146 /* Recode window to a signed digit, see ecp_nistputil.c for details */ 147 static unsigned int _booth_recode_w5(unsigned int in) 148 { 149 unsigned int s, d; 150 151 s = ~((in >> 5) - 1); 152 d = (1 << 6) - in - 1; 153 d = (d & s) | (in & ~s); 154 d = (d >> 1) + (d & 1); 155 156 return (d << 1) + (s & 1); 157 } 158 159 static unsigned int _booth_recode_w7(unsigned int in) 160 { 161 unsigned int s, d; 162 163 s = ~((in >> 7) - 1); 164 d = (1 << 8) - in - 1; 165 d = (d & s) | (in & ~s); 166 d = (d >> 1) + (d & 1); 167 168 return (d << 1) + (s & 1); 169 } 170 171 static void copy_conditional(BN_ULONG dst[P256_LIMBS], 172 const BN_ULONG src[P256_LIMBS], BN_ULONG move) 173 { 174 BN_ULONG mask1 = 0-move; 175 BN_ULONG mask2 = ~mask1; 176 177 dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); 178 dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); 179 dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); 180 dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); 181 if (P256_LIMBS == 8) { 182 dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); 183 dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); 184 dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); 185 dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); 186 } 187 } 188 189 static BN_ULONG is_zero(BN_ULONG in) 190 { 191 in |= (0 - in); 192 in = ~in; 193 in >>= BN_BITS2 - 1; 194 return in; 195 } 196 197 static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS], 198 const BN_ULONG b[P256_LIMBS]) 199 { 200 BN_ULONG res; 201 202 res = a[0] ^ b[0]; 203 res |= a[1] ^ b[1]; 204 res |= a[2] ^ b[2]; 205 res |= a[3] ^ b[3]; 206 if (P256_LIMBS == 8) { 207 res |= a[4] ^ b[4]; 208 res |= a[5] ^ b[5]; 209 res |= a[6] ^ b[6]; 210 res |= a[7] ^ b[7]; 211 } 212 213 return is_zero(res); 214 } 215 216 static BN_ULONG is_one(const BIGNUM *z) 217 { 218 BN_ULONG res = 0; 219 BN_ULONG *a = bn_get_words(z); 220 221 if (bn_get_top(z) == (P256_LIMBS - P256_LIMBS / 8)) { 222 res = a[0] ^ ONE[0]; 223 res |= a[1] ^ ONE[1]; 224 res |= a[2] ^ ONE[2]; 225 res |= a[3] ^ ONE[3]; 226 if (P256_LIMBS == 8) { 227 res |= a[4] ^ ONE[4]; 228 res |= a[5] ^ ONE[5]; 229 res |= a[6] ^ ONE[6]; 230 /* 231 * no check for a[7] (being zero) on 32-bit platforms, 232 * because value of "one" takes only 7 limbs. 233 */ 234 } 235 res = is_zero(res); 236 } 237 238 return res; 239 } 240 241 /* 242 * For reference, this macro is used only when new ecp_nistz256 assembly 243 * module is being developed. For example, configure with 244 * -DECP_NISTZ256_REFERENCE_IMPLEMENTATION and implement only functions 245 * performing simplest arithmetic operations on 256-bit vectors. Then 246 * work on implementation of higher-level functions performing point 247 * operations. Then remove ECP_NISTZ256_REFERENCE_IMPLEMENTATION 248 * and never define it again. (The correct macro denoting presence of 249 * ecp_nistz256 module is ECP_NISTZ256_ASM.) 250 */ 251 #ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION 252 void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); 253 void ecp_nistz256_point_add(P256_POINT *r, 254 const P256_POINT *a, const P256_POINT *b); 255 void ecp_nistz256_point_add_affine(P256_POINT *r, 256 const P256_POINT *a, 257 const P256_POINT_AFFINE *b); 258 #else 259 /* Point double: r = 2*a */ 260 static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) 261 { 262 BN_ULONG S[P256_LIMBS]; 263 BN_ULONG M[P256_LIMBS]; 264 BN_ULONG Zsqr[P256_LIMBS]; 265 BN_ULONG tmp0[P256_LIMBS]; 266 267 const BN_ULONG *in_x = a->X; 268 const BN_ULONG *in_y = a->Y; 269 const BN_ULONG *in_z = a->Z; 270 271 BN_ULONG *res_x = r->X; 272 BN_ULONG *res_y = r->Y; 273 BN_ULONG *res_z = r->Z; 274 275 ecp_nistz256_mul_by_2(S, in_y); 276 277 ecp_nistz256_sqr_mont(Zsqr, in_z); 278 279 ecp_nistz256_sqr_mont(S, S); 280 281 ecp_nistz256_mul_mont(res_z, in_z, in_y); 282 ecp_nistz256_mul_by_2(res_z, res_z); 283 284 ecp_nistz256_add(M, in_x, Zsqr); 285 ecp_nistz256_sub(Zsqr, in_x, Zsqr); 286 287 ecp_nistz256_sqr_mont(res_y, S); 288 ecp_nistz256_div_by_2(res_y, res_y); 289 290 ecp_nistz256_mul_mont(M, M, Zsqr); 291 ecp_nistz256_mul_by_3(M, M); 292 293 ecp_nistz256_mul_mont(S, S, in_x); 294 ecp_nistz256_mul_by_2(tmp0, S); 295 296 ecp_nistz256_sqr_mont(res_x, M); 297 298 ecp_nistz256_sub(res_x, res_x, tmp0); 299 ecp_nistz256_sub(S, S, res_x); 300 301 ecp_nistz256_mul_mont(S, S, M); 302 ecp_nistz256_sub(res_y, S, res_y); 303 } 304 305 /* Point addition: r = a+b */ 306 static void ecp_nistz256_point_add(P256_POINT *r, 307 const P256_POINT *a, const P256_POINT *b) 308 { 309 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 310 BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS]; 311 BN_ULONG Z1sqr[P256_LIMBS]; 312 BN_ULONG Z2sqr[P256_LIMBS]; 313 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 314 BN_ULONG Hsqr[P256_LIMBS]; 315 BN_ULONG Rsqr[P256_LIMBS]; 316 BN_ULONG Hcub[P256_LIMBS]; 317 318 BN_ULONG res_x[P256_LIMBS]; 319 BN_ULONG res_y[P256_LIMBS]; 320 BN_ULONG res_z[P256_LIMBS]; 321 322 BN_ULONG in1infty, in2infty; 323 324 const BN_ULONG *in1_x = a->X; 325 const BN_ULONG *in1_y = a->Y; 326 const BN_ULONG *in1_z = a->Z; 327 328 const BN_ULONG *in2_x = b->X; 329 const BN_ULONG *in2_y = b->Y; 330 const BN_ULONG *in2_z = b->Z; 331 332 /* 333 * Infinity in encoded as (,,0) 334 */ 335 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 336 if (P256_LIMBS == 8) 337 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 338 339 in2infty = (in2_z[0] | in2_z[1] | in2_z[2] | in2_z[3]); 340 if (P256_LIMBS == 8) 341 in2infty |= (in2_z[4] | in2_z[5] | in2_z[6] | in2_z[7]); 342 343 in1infty = is_zero(in1infty); 344 in2infty = is_zero(in2infty); 345 346 ecp_nistz256_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ 347 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 348 349 ecp_nistz256_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ 350 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 351 352 ecp_nistz256_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ 353 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 354 ecp_nistz256_sub(R, S2, S1); /* R = S2 - S1 */ 355 356 ecp_nistz256_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ 357 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 358 ecp_nistz256_sub(H, U2, U1); /* H = U2 - U1 */ 359 360 /* 361 * This should not happen during sign/ecdh, so no constant time violation 362 */ 363 if (is_equal(U1, U2) && !in1infty && !in2infty) { 364 if (is_equal(S1, S2)) { 365 ecp_nistz256_point_double(r, a); 366 return; 367 } else { 368 memset(r, 0, sizeof(*r)); 369 return; 370 } 371 } 372 373 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 374 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 375 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 376 ecp_nistz256_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ 377 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 378 379 ecp_nistz256_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ 380 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 381 382 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 383 ecp_nistz256_sub(res_x, res_x, Hcub); 384 385 ecp_nistz256_sub(res_y, U2, res_x); 386 387 ecp_nistz256_mul_mont(S2, S1, Hcub); 388 ecp_nistz256_mul_mont(res_y, R, res_y); 389 ecp_nistz256_sub(res_y, res_y, S2); 390 391 copy_conditional(res_x, in2_x, in1infty); 392 copy_conditional(res_y, in2_y, in1infty); 393 copy_conditional(res_z, in2_z, in1infty); 394 395 copy_conditional(res_x, in1_x, in2infty); 396 copy_conditional(res_y, in1_y, in2infty); 397 copy_conditional(res_z, in1_z, in2infty); 398 399 memcpy(r->X, res_x, sizeof(res_x)); 400 memcpy(r->Y, res_y, sizeof(res_y)); 401 memcpy(r->Z, res_z, sizeof(res_z)); 402 } 403 404 /* Point addition when b is known to be affine: r = a+b */ 405 static void ecp_nistz256_point_add_affine(P256_POINT *r, 406 const P256_POINT *a, 407 const P256_POINT_AFFINE *b) 408 { 409 BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS]; 410 BN_ULONG Z1sqr[P256_LIMBS]; 411 BN_ULONG H[P256_LIMBS], R[P256_LIMBS]; 412 BN_ULONG Hsqr[P256_LIMBS]; 413 BN_ULONG Rsqr[P256_LIMBS]; 414 BN_ULONG Hcub[P256_LIMBS]; 415 416 BN_ULONG res_x[P256_LIMBS]; 417 BN_ULONG res_y[P256_LIMBS]; 418 BN_ULONG res_z[P256_LIMBS]; 419 420 BN_ULONG in1infty, in2infty; 421 422 const BN_ULONG *in1_x = a->X; 423 const BN_ULONG *in1_y = a->Y; 424 const BN_ULONG *in1_z = a->Z; 425 426 const BN_ULONG *in2_x = b->X; 427 const BN_ULONG *in2_y = b->Y; 428 429 /* 430 * Infinity in encoded as (,,0) 431 */ 432 in1infty = (in1_z[0] | in1_z[1] | in1_z[2] | in1_z[3]); 433 if (P256_LIMBS == 8) 434 in1infty |= (in1_z[4] | in1_z[5] | in1_z[6] | in1_z[7]); 435 436 /* 437 * In affine representation we encode infinity as (0,0), which is 438 * not on the curve, so it is OK 439 */ 440 in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] | 441 in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]); 442 if (P256_LIMBS == 8) 443 in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] | 444 in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]); 445 446 in1infty = is_zero(in1infty); 447 in2infty = is_zero(in2infty); 448 449 ecp_nistz256_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ 450 451 ecp_nistz256_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ 452 ecp_nistz256_sub(H, U2, in1_x); /* H = U2 - U1 */ 453 454 ecp_nistz256_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ 455 456 ecp_nistz256_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ 457 458 ecp_nistz256_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ 459 ecp_nistz256_sub(R, S2, in1_y); /* R = S2 - S1 */ 460 461 ecp_nistz256_sqr_mont(Hsqr, H); /* H^2 */ 462 ecp_nistz256_sqr_mont(Rsqr, R); /* R^2 */ 463 ecp_nistz256_mul_mont(Hcub, Hsqr, H); /* H^3 */ 464 465 ecp_nistz256_mul_mont(U2, in1_x, Hsqr); /* U1*H^2 */ 466 ecp_nistz256_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ 467 468 ecp_nistz256_sub(res_x, Rsqr, Hsqr); 469 ecp_nistz256_sub(res_x, res_x, Hcub); 470 ecp_nistz256_sub(H, U2, res_x); 471 472 ecp_nistz256_mul_mont(S2, in1_y, Hcub); 473 ecp_nistz256_mul_mont(H, H, R); 474 ecp_nistz256_sub(res_y, H, S2); 475 476 copy_conditional(res_x, in2_x, in1infty); 477 copy_conditional(res_x, in1_x, in2infty); 478 479 copy_conditional(res_y, in2_y, in1infty); 480 copy_conditional(res_y, in1_y, in2infty); 481 482 copy_conditional(res_z, ONE, in1infty); 483 copy_conditional(res_z, in1_z, in2infty); 484 485 memcpy(r->X, res_x, sizeof(res_x)); 486 memcpy(r->Y, res_y, sizeof(res_y)); 487 memcpy(r->Z, res_z, sizeof(res_z)); 488 } 489 #endif 490 491 /* r = in^-1 mod p */ 492 static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS], 493 const BN_ULONG in[P256_LIMBS]) 494 { 495 /* 496 * The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff 497 * ffffffff ffffffff We use FLT and used poly-2 as exponent 498 */ 499 BN_ULONG p2[P256_LIMBS]; 500 BN_ULONG p4[P256_LIMBS]; 501 BN_ULONG p8[P256_LIMBS]; 502 BN_ULONG p16[P256_LIMBS]; 503 BN_ULONG p32[P256_LIMBS]; 504 BN_ULONG res[P256_LIMBS]; 505 int i; 506 507 ecp_nistz256_sqr_mont(res, in); 508 ecp_nistz256_mul_mont(p2, res, in); /* 3*p */ 509 510 ecp_nistz256_sqr_mont(res, p2); 511 ecp_nistz256_sqr_mont(res, res); 512 ecp_nistz256_mul_mont(p4, res, p2); /* f*p */ 513 514 ecp_nistz256_sqr_mont(res, p4); 515 ecp_nistz256_sqr_mont(res, res); 516 ecp_nistz256_sqr_mont(res, res); 517 ecp_nistz256_sqr_mont(res, res); 518 ecp_nistz256_mul_mont(p8, res, p4); /* ff*p */ 519 520 ecp_nistz256_sqr_mont(res, p8); 521 for (i = 0; i < 7; i++) 522 ecp_nistz256_sqr_mont(res, res); 523 ecp_nistz256_mul_mont(p16, res, p8); /* ffff*p */ 524 525 ecp_nistz256_sqr_mont(res, p16); 526 for (i = 0; i < 15; i++) 527 ecp_nistz256_sqr_mont(res, res); 528 ecp_nistz256_mul_mont(p32, res, p16); /* ffffffff*p */ 529 530 ecp_nistz256_sqr_mont(res, p32); 531 for (i = 0; i < 31; i++) 532 ecp_nistz256_sqr_mont(res, res); 533 ecp_nistz256_mul_mont(res, res, in); 534 535 for (i = 0; i < 32 * 4; i++) 536 ecp_nistz256_sqr_mont(res, res); 537 ecp_nistz256_mul_mont(res, res, p32); 538 539 for (i = 0; i < 32; i++) 540 ecp_nistz256_sqr_mont(res, res); 541 ecp_nistz256_mul_mont(res, res, p32); 542 543 for (i = 0; i < 16; i++) 544 ecp_nistz256_sqr_mont(res, res); 545 ecp_nistz256_mul_mont(res, res, p16); 546 547 for (i = 0; i < 8; i++) 548 ecp_nistz256_sqr_mont(res, res); 549 ecp_nistz256_mul_mont(res, res, p8); 550 551 ecp_nistz256_sqr_mont(res, res); 552 ecp_nistz256_sqr_mont(res, res); 553 ecp_nistz256_sqr_mont(res, res); 554 ecp_nistz256_sqr_mont(res, res); 555 ecp_nistz256_mul_mont(res, res, p4); 556 557 ecp_nistz256_sqr_mont(res, res); 558 ecp_nistz256_sqr_mont(res, res); 559 ecp_nistz256_mul_mont(res, res, p2); 560 561 ecp_nistz256_sqr_mont(res, res); 562 ecp_nistz256_sqr_mont(res, res); 563 ecp_nistz256_mul_mont(res, res, in); 564 565 memcpy(r, res, sizeof(res)); 566 } 567 568 /* 569 * ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and 570 * returns one if it fits. Otherwise it returns zero. 571 */ 572 __owur static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS], 573 const BIGNUM *in) 574 { 575 return bn_copy_words(out, in, P256_LIMBS); 576 } 577 578 /* r = sum(scalar[i]*point[i]) */ 579 __owur static int ecp_nistz256_windowed_mul(const EC_GROUP *group, 580 P256_POINT *r, 581 const BIGNUM **scalar, 582 const EC_POINT **point, 583 size_t num, BN_CTX *ctx) 584 { 585 size_t i; 586 int j, ret = 0; 587 unsigned int idx; 588 unsigned char (*p_str)[33] = NULL; 589 const unsigned int window_size = 5; 590 const unsigned int mask = (1 << (window_size + 1)) - 1; 591 unsigned int wvalue; 592 P256_POINT *temp; /* place for 5 temporary points */ 593 const BIGNUM **scalars = NULL; 594 P256_POINT (*table)[16] = NULL; 595 void *table_storage = NULL; 596 597 if ((num * 16 + 6) > OPENSSL_MALLOC_MAX_NELEMS(P256_POINT) 598 || (table_storage = 599 OPENSSL_malloc((num * 16 + 5) * sizeof(P256_POINT) + 64)) == NULL 600 || (p_str = 601 OPENSSL_malloc(num * 33 * sizeof(unsigned char))) == NULL 602 || (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) { 603 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_MALLOC_FAILURE); 604 goto err; 605 } 606 607 table = (void *)ALIGNPTR(table_storage, 64); 608 temp = (P256_POINT *)(table + num); 609 610 for (i = 0; i < num; i++) { 611 P256_POINT *row = table[i]; 612 613 /* This is an unusual input, we don't guarantee constant-timeness. */ 614 if ((BN_num_bits(scalar[i]) > 256) || BN_is_negative(scalar[i])) { 615 BIGNUM *mod; 616 617 if ((mod = BN_CTX_get(ctx)) == NULL) 618 goto err; 619 if (!BN_nnmod(mod, scalar[i], group->order, ctx)) { 620 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_BN_LIB); 621 goto err; 622 } 623 scalars[i] = mod; 624 } else 625 scalars[i] = scalar[i]; 626 627 for (j = 0; j < bn_get_top(scalars[i]) * BN_BYTES; j += BN_BYTES) { 628 BN_ULONG d = bn_get_words(scalars[i])[j / BN_BYTES]; 629 630 p_str[i][j + 0] = (unsigned char)d; 631 p_str[i][j + 1] = (unsigned char)(d >> 8); 632 p_str[i][j + 2] = (unsigned char)(d >> 16); 633 p_str[i][j + 3] = (unsigned char)(d >>= 24); 634 if (BN_BYTES == 8) { 635 d >>= 8; 636 p_str[i][j + 4] = (unsigned char)d; 637 p_str[i][j + 5] = (unsigned char)(d >> 8); 638 p_str[i][j + 6] = (unsigned char)(d >> 16); 639 p_str[i][j + 7] = (unsigned char)(d >> 24); 640 } 641 } 642 for (; j < 33; j++) 643 p_str[i][j] = 0; 644 645 if (!ecp_nistz256_bignum_to_field_elem(temp[0].X, point[i]->X) 646 || !ecp_nistz256_bignum_to_field_elem(temp[0].Y, point[i]->Y) 647 || !ecp_nistz256_bignum_to_field_elem(temp[0].Z, point[i]->Z)) { 648 ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, 649 EC_R_COORDINATES_OUT_OF_RANGE); 650 goto err; 651 } 652 653 /* 654 * row[0] is implicitly (0,0,0) (the point at infinity), therefore it 655 * is not stored. All other values are actually stored with an offset 656 * of -1 in table. 657 */ 658 659 ecp_nistz256_scatter_w5 (row, &temp[0], 1); 660 ecp_nistz256_point_double(&temp[1], &temp[0]); /*1+1=2 */ 661 ecp_nistz256_scatter_w5 (row, &temp[1], 2); 662 ecp_nistz256_point_add (&temp[2], &temp[1], &temp[0]); /*2+1=3 */ 663 ecp_nistz256_scatter_w5 (row, &temp[2], 3); 664 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*2=4 */ 665 ecp_nistz256_scatter_w5 (row, &temp[1], 4); 666 ecp_nistz256_point_double(&temp[2], &temp[2]); /*2*3=6 */ 667 ecp_nistz256_scatter_w5 (row, &temp[2], 6); 668 ecp_nistz256_point_add (&temp[3], &temp[1], &temp[0]); /*4+1=5 */ 669 ecp_nistz256_scatter_w5 (row, &temp[3], 5); 670 ecp_nistz256_point_add (&temp[4], &temp[2], &temp[0]); /*6+1=7 */ 671 ecp_nistz256_scatter_w5 (row, &temp[4], 7); 672 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*4=8 */ 673 ecp_nistz256_scatter_w5 (row, &temp[1], 8); 674 ecp_nistz256_point_double(&temp[2], &temp[2]); /*2*6=12 */ 675 ecp_nistz256_scatter_w5 (row, &temp[2], 12); 676 ecp_nistz256_point_double(&temp[3], &temp[3]); /*2*5=10 */ 677 ecp_nistz256_scatter_w5 (row, &temp[3], 10); 678 ecp_nistz256_point_double(&temp[4], &temp[4]); /*2*7=14 */ 679 ecp_nistz256_scatter_w5 (row, &temp[4], 14); 680 ecp_nistz256_point_add (&temp[2], &temp[2], &temp[0]); /*12+1=13*/ 681 ecp_nistz256_scatter_w5 (row, &temp[2], 13); 682 ecp_nistz256_point_add (&temp[3], &temp[3], &temp[0]); /*10+1=11*/ 683 ecp_nistz256_scatter_w5 (row, &temp[3], 11); 684 ecp_nistz256_point_add (&temp[4], &temp[4], &temp[0]); /*14+1=15*/ 685 ecp_nistz256_scatter_w5 (row, &temp[4], 15); 686 ecp_nistz256_point_add (&temp[2], &temp[1], &temp[0]); /*8+1=9 */ 687 ecp_nistz256_scatter_w5 (row, &temp[2], 9); 688 ecp_nistz256_point_double(&temp[1], &temp[1]); /*2*8=16 */ 689 ecp_nistz256_scatter_w5 (row, &temp[1], 16); 690 } 691 692 idx = 255; 693 694 wvalue = p_str[0][(idx - 1) / 8]; 695 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 696 697 /* 698 * We gather to temp[0], because we know it's position relative 699 * to table 700 */ 701 ecp_nistz256_gather_w5(&temp[0], table[0], _booth_recode_w5(wvalue) >> 1); 702 memcpy(r, &temp[0], sizeof(temp[0])); 703 704 while (idx >= 5) { 705 for (i = (idx == 255 ? 1 : 0); i < num; i++) { 706 unsigned int off = (idx - 1) / 8; 707 708 wvalue = p_str[i][off] | p_str[i][off + 1] << 8; 709 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 710 711 wvalue = _booth_recode_w5(wvalue); 712 713 ecp_nistz256_gather_w5(&temp[0], table[i], wvalue >> 1); 714 715 ecp_nistz256_neg(temp[1].Y, temp[0].Y); 716 copy_conditional(temp[0].Y, temp[1].Y, (wvalue & 1)); 717 718 ecp_nistz256_point_add(r, r, &temp[0]); 719 } 720 721 idx -= window_size; 722 723 ecp_nistz256_point_double(r, r); 724 ecp_nistz256_point_double(r, r); 725 ecp_nistz256_point_double(r, r); 726 ecp_nistz256_point_double(r, r); 727 ecp_nistz256_point_double(r, r); 728 } 729 730 /* Final window */ 731 for (i = 0; i < num; i++) { 732 wvalue = p_str[i][0]; 733 wvalue = (wvalue << 1) & mask; 734 735 wvalue = _booth_recode_w5(wvalue); 736 737 ecp_nistz256_gather_w5(&temp[0], table[i], wvalue >> 1); 738 739 ecp_nistz256_neg(temp[1].Y, temp[0].Y); 740 copy_conditional(temp[0].Y, temp[1].Y, wvalue & 1); 741 742 ecp_nistz256_point_add(r, r, &temp[0]); 743 } 744 745 ret = 1; 746 err: 747 OPENSSL_free(table_storage); 748 OPENSSL_free(p_str); 749 OPENSSL_free(scalars); 750 return ret; 751 } 752 753 /* Coordinates of G, for which we have precomputed tables */ 754 static const BN_ULONG def_xG[P256_LIMBS] = { 755 TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), 756 TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6) 757 }; 758 759 static const BN_ULONG def_yG[P256_LIMBS] = { 760 TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), 761 TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85) 762 }; 763 764 /* 765 * ecp_nistz256_is_affine_G returns one if |generator| is the standard, P-256 766 * generator. 767 */ 768 static int ecp_nistz256_is_affine_G(const EC_POINT *generator) 769 { 770 return (bn_get_top(generator->X) == P256_LIMBS) && 771 (bn_get_top(generator->Y) == P256_LIMBS) && 772 is_equal(bn_get_words(generator->X), def_xG) && 773 is_equal(bn_get_words(generator->Y), def_yG) && 774 is_one(generator->Z); 775 } 776 777 __owur static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx) 778 { 779 /* 780 * We precompute a table for a Booth encoded exponent (wNAF) based 781 * computation. Each table holds 64 values for safe access, with an 782 * implicit value of infinity at index zero. We use window of size 7, and 783 * therefore require ceil(256/7) = 37 tables. 784 */ 785 const BIGNUM *order; 786 EC_POINT *P = NULL, *T = NULL; 787 const EC_POINT *generator; 788 NISTZ256_PRE_COMP *pre_comp; 789 BN_CTX *new_ctx = NULL; 790 int i, j, k, ret = 0; 791 size_t w; 792 793 PRECOMP256_ROW *preComputedTable = NULL; 794 unsigned char *precomp_storage = NULL; 795 796 /* if there is an old NISTZ256_PRE_COMP object, throw it away */ 797 EC_pre_comp_free(group); 798 generator = EC_GROUP_get0_generator(group); 799 if (generator == NULL) { 800 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNDEFINED_GENERATOR); 801 return 0; 802 } 803 804 if (ecp_nistz256_is_affine_G(generator)) { 805 /* 806 * No need to calculate tables for the standard generator because we 807 * have them statically. 808 */ 809 return 1; 810 } 811 812 if ((pre_comp = ecp_nistz256_pre_comp_new(group)) == NULL) 813 return 0; 814 815 if (ctx == NULL) { 816 ctx = new_ctx = BN_CTX_new(); 817 if (ctx == NULL) 818 goto err; 819 } 820 821 BN_CTX_start(ctx); 822 823 order = EC_GROUP_get0_order(group); 824 if (order == NULL) 825 goto err; 826 827 if (BN_is_zero(order)) { 828 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNKNOWN_ORDER); 829 goto err; 830 } 831 832 w = 7; 833 834 if ((precomp_storage = 835 OPENSSL_malloc(37 * 64 * sizeof(P256_POINT_AFFINE) + 64)) == NULL) { 836 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, ERR_R_MALLOC_FAILURE); 837 goto err; 838 } 839 840 preComputedTable = (void *)ALIGNPTR(precomp_storage, 64); 841 842 P = EC_POINT_new(group); 843 T = EC_POINT_new(group); 844 if (P == NULL || T == NULL) 845 goto err; 846 847 /* 848 * The zero entry is implicitly infinity, and we skip it, storing other 849 * values with -1 offset. 850 */ 851 if (!EC_POINT_copy(T, generator)) 852 goto err; 853 854 for (k = 0; k < 64; k++) { 855 if (!EC_POINT_copy(P, T)) 856 goto err; 857 for (j = 0; j < 37; j++) { 858 P256_POINT_AFFINE temp; 859 /* 860 * It would be faster to use EC_POINTs_make_affine and 861 * make multiple points affine at the same time. 862 */ 863 if (!EC_POINT_make_affine(group, P, ctx)) 864 goto err; 865 if (!ecp_nistz256_bignum_to_field_elem(temp.X, P->X) || 866 !ecp_nistz256_bignum_to_field_elem(temp.Y, P->Y)) { 867 ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 868 EC_R_COORDINATES_OUT_OF_RANGE); 869 goto err; 870 } 871 ecp_nistz256_scatter_w7(preComputedTable[j], &temp, k); 872 for (i = 0; i < 7; i++) { 873 if (!EC_POINT_dbl(group, P, P, ctx)) 874 goto err; 875 } 876 } 877 if (!EC_POINT_add(group, T, T, generator, ctx)) 878 goto err; 879 } 880 881 pre_comp->group = group; 882 pre_comp->w = w; 883 pre_comp->precomp = preComputedTable; 884 pre_comp->precomp_storage = precomp_storage; 885 precomp_storage = NULL; 886 SETPRECOMP(group, nistz256, pre_comp); 887 pre_comp = NULL; 888 ret = 1; 889 890 err: 891 if (ctx != NULL) 892 BN_CTX_end(ctx); 893 BN_CTX_free(new_ctx); 894 895 EC_nistz256_pre_comp_free(pre_comp); 896 OPENSSL_free(precomp_storage); 897 EC_POINT_free(P); 898 EC_POINT_free(T); 899 return ret; 900 } 901 902 /* 903 * Note that by default ECP_NISTZ256_AVX2 is undefined. While it's great 904 * code processing 4 points in parallel, corresponding serial operation 905 * is several times slower, because it uses 29x29=58-bit multiplication 906 * as opposite to 64x64=128-bit in integer-only scalar case. As result 907 * it doesn't provide *significant* performance improvement. Note that 908 * just defining ECP_NISTZ256_AVX2 is not sufficient to make it work, 909 * you'd need to compile even asm/ecp_nistz256-avx.pl module. 910 */ 911 #if defined(ECP_NISTZ256_AVX2) 912 # if !(defined(__x86_64) || defined(__x86_64__) || \ 913 defined(_M_AMD64) || defined(_M_X64)) || \ 914 !(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */ 915 # undef ECP_NISTZ256_AVX2 916 # else 917 /* Constant time access, loading four values, from four consecutive tables */ 918 void ecp_nistz256_avx2_multi_gather_w7(void *result, const void *in, 919 int index0, int index1, int index2, 920 int index3); 921 void ecp_nistz256_avx2_transpose_convert(void *RESULTx4, const void *in); 922 void ecp_nistz256_avx2_convert_transpose_back(void *result, const void *Ax4); 923 void ecp_nistz256_avx2_point_add_affine_x4(void *RESULTx4, const void *Ax4, 924 const void *Bx4); 925 void ecp_nistz256_avx2_point_add_affines_x4(void *RESULTx4, const void *Ax4, 926 const void *Bx4); 927 void ecp_nistz256_avx2_to_mont(void *RESULTx4, const void *Ax4); 928 void ecp_nistz256_avx2_from_mont(void *RESULTx4, const void *Ax4); 929 void ecp_nistz256_avx2_set1(void *RESULTx4); 930 int ecp_nistz_avx2_eligible(void); 931 932 static void booth_recode_w7(unsigned char *sign, 933 unsigned char *digit, unsigned char in) 934 { 935 unsigned char s, d; 936 937 s = ~((in >> 7) - 1); 938 d = (1 << 8) - in - 1; 939 d = (d & s) | (in & ~s); 940 d = (d >> 1) + (d & 1); 941 942 *sign = s & 1; 943 *digit = d; 944 } 945 946 /* 947 * ecp_nistz256_avx2_mul_g performs multiplication by G, using only the 948 * precomputed table. It does 4 affine point additions in parallel, 949 * significantly speeding up point multiplication for a fixed value. 950 */ 951 static void ecp_nistz256_avx2_mul_g(P256_POINT *r, 952 unsigned char p_str[33], 953 const P256_POINT_AFFINE(*preComputedTable)[64]) 954 { 955 const unsigned int window_size = 7; 956 const unsigned int mask = (1 << (window_size + 1)) - 1; 957 unsigned int wvalue; 958 /* Using 4 windows at a time */ 959 unsigned char sign0, digit0; 960 unsigned char sign1, digit1; 961 unsigned char sign2, digit2; 962 unsigned char sign3, digit3; 963 unsigned int idx = 0; 964 BN_ULONG tmp[P256_LIMBS]; 965 int i; 966 967 ALIGN32 BN_ULONG aX4[4 * 9 * 3] = { 0 }; 968 ALIGN32 BN_ULONG bX4[4 * 9 * 2] = { 0 }; 969 ALIGN32 P256_POINT_AFFINE point_arr[4]; 970 ALIGN32 P256_POINT res_point_arr[4]; 971 972 /* Initial four windows */ 973 wvalue = *((u16 *) & p_str[0]); 974 wvalue = (wvalue << 1) & mask; 975 idx += window_size; 976 booth_recode_w7(&sign0, &digit0, wvalue); 977 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 978 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 979 idx += window_size; 980 booth_recode_w7(&sign1, &digit1, wvalue); 981 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 982 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 983 idx += window_size; 984 booth_recode_w7(&sign2, &digit2, wvalue); 985 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 986 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 987 idx += window_size; 988 booth_recode_w7(&sign3, &digit3, wvalue); 989 990 ecp_nistz256_avx2_multi_gather_w7(point_arr, preComputedTable[0], 991 digit0, digit1, digit2, digit3); 992 993 ecp_nistz256_neg(tmp, point_arr[0].Y); 994 copy_conditional(point_arr[0].Y, tmp, sign0); 995 ecp_nistz256_neg(tmp, point_arr[1].Y); 996 copy_conditional(point_arr[1].Y, tmp, sign1); 997 ecp_nistz256_neg(tmp, point_arr[2].Y); 998 copy_conditional(point_arr[2].Y, tmp, sign2); 999 ecp_nistz256_neg(tmp, point_arr[3].Y); 1000 copy_conditional(point_arr[3].Y, tmp, sign3); 1001 1002 ecp_nistz256_avx2_transpose_convert(aX4, point_arr); 1003 ecp_nistz256_avx2_to_mont(aX4, aX4); 1004 ecp_nistz256_avx2_to_mont(&aX4[4 * 9], &aX4[4 * 9]); 1005 ecp_nistz256_avx2_set1(&aX4[4 * 9 * 2]); 1006 1007 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1008 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1009 idx += window_size; 1010 booth_recode_w7(&sign0, &digit0, wvalue); 1011 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1012 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1013 idx += window_size; 1014 booth_recode_w7(&sign1, &digit1, wvalue); 1015 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1016 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1017 idx += window_size; 1018 booth_recode_w7(&sign2, &digit2, wvalue); 1019 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1020 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1021 idx += window_size; 1022 booth_recode_w7(&sign3, &digit3, wvalue); 1023 1024 ecp_nistz256_avx2_multi_gather_w7(point_arr, preComputedTable[4 * 1], 1025 digit0, digit1, digit2, digit3); 1026 1027 ecp_nistz256_neg(tmp, point_arr[0].Y); 1028 copy_conditional(point_arr[0].Y, tmp, sign0); 1029 ecp_nistz256_neg(tmp, point_arr[1].Y); 1030 copy_conditional(point_arr[1].Y, tmp, sign1); 1031 ecp_nistz256_neg(tmp, point_arr[2].Y); 1032 copy_conditional(point_arr[2].Y, tmp, sign2); 1033 ecp_nistz256_neg(tmp, point_arr[3].Y); 1034 copy_conditional(point_arr[3].Y, tmp, sign3); 1035 1036 ecp_nistz256_avx2_transpose_convert(bX4, point_arr); 1037 ecp_nistz256_avx2_to_mont(bX4, bX4); 1038 ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]); 1039 /* Optimized when both inputs are affine */ 1040 ecp_nistz256_avx2_point_add_affines_x4(aX4, aX4, bX4); 1041 1042 for (i = 2; i < 9; i++) { 1043 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1044 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1045 idx += window_size; 1046 booth_recode_w7(&sign0, &digit0, wvalue); 1047 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1048 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1049 idx += window_size; 1050 booth_recode_w7(&sign1, &digit1, wvalue); 1051 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1052 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1053 idx += window_size; 1054 booth_recode_w7(&sign2, &digit2, wvalue); 1055 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1056 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1057 idx += window_size; 1058 booth_recode_w7(&sign3, &digit3, wvalue); 1059 1060 ecp_nistz256_avx2_multi_gather_w7(point_arr, 1061 preComputedTable[4 * i], 1062 digit0, digit1, digit2, digit3); 1063 1064 ecp_nistz256_neg(tmp, point_arr[0].Y); 1065 copy_conditional(point_arr[0].Y, tmp, sign0); 1066 ecp_nistz256_neg(tmp, point_arr[1].Y); 1067 copy_conditional(point_arr[1].Y, tmp, sign1); 1068 ecp_nistz256_neg(tmp, point_arr[2].Y); 1069 copy_conditional(point_arr[2].Y, tmp, sign2); 1070 ecp_nistz256_neg(tmp, point_arr[3].Y); 1071 copy_conditional(point_arr[3].Y, tmp, sign3); 1072 1073 ecp_nistz256_avx2_transpose_convert(bX4, point_arr); 1074 ecp_nistz256_avx2_to_mont(bX4, bX4); 1075 ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]); 1076 1077 ecp_nistz256_avx2_point_add_affine_x4(aX4, aX4, bX4); 1078 } 1079 1080 ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 0], &aX4[4 * 9 * 0]); 1081 ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 1], &aX4[4 * 9 * 1]); 1082 ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 2], &aX4[4 * 9 * 2]); 1083 1084 ecp_nistz256_avx2_convert_transpose_back(res_point_arr, aX4); 1085 /* Last window is performed serially */ 1086 wvalue = *((u16 *) & p_str[(idx - 1) / 8]); 1087 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1088 booth_recode_w7(&sign0, &digit0, wvalue); 1089 ecp_nistz256_gather_w7((P256_POINT_AFFINE *)r, 1090 preComputedTable[36], digit0); 1091 ecp_nistz256_neg(tmp, r->Y); 1092 copy_conditional(r->Y, tmp, sign0); 1093 memcpy(r->Z, ONE, sizeof(ONE)); 1094 /* Sum the four windows */ 1095 ecp_nistz256_point_add(r, r, &res_point_arr[0]); 1096 ecp_nistz256_point_add(r, r, &res_point_arr[1]); 1097 ecp_nistz256_point_add(r, r, &res_point_arr[2]); 1098 ecp_nistz256_point_add(r, r, &res_point_arr[3]); 1099 } 1100 # endif 1101 #endif 1102 1103 __owur static int ecp_nistz256_set_from_affine(EC_POINT *out, const EC_GROUP *group, 1104 const P256_POINT_AFFINE *in, 1105 BN_CTX *ctx) 1106 { 1107 int ret = 0; 1108 1109 if ((ret = bn_set_words(out->X, in->X, P256_LIMBS)) 1110 && (ret = bn_set_words(out->Y, in->Y, P256_LIMBS)) 1111 && (ret = bn_set_words(out->Z, ONE, P256_LIMBS))) 1112 out->Z_is_one = 1; 1113 1114 return ret; 1115 } 1116 1117 /* r = scalar*G + sum(scalars[i]*points[i]) */ 1118 __owur static int ecp_nistz256_points_mul(const EC_GROUP *group, 1119 EC_POINT *r, 1120 const BIGNUM *scalar, 1121 size_t num, 1122 const EC_POINT *points[], 1123 const BIGNUM *scalars[], BN_CTX *ctx) 1124 { 1125 int i = 0, ret = 0, no_precomp_for_generator = 0, p_is_infinity = 0; 1126 unsigned char p_str[33] = { 0 }; 1127 const PRECOMP256_ROW *preComputedTable = NULL; 1128 const NISTZ256_PRE_COMP *pre_comp = NULL; 1129 const EC_POINT *generator = NULL; 1130 const BIGNUM **new_scalars = NULL; 1131 const EC_POINT **new_points = NULL; 1132 unsigned int idx = 0; 1133 const unsigned int window_size = 7; 1134 const unsigned int mask = (1 << (window_size + 1)) - 1; 1135 unsigned int wvalue; 1136 ALIGN32 union { 1137 P256_POINT p; 1138 P256_POINT_AFFINE a; 1139 } t, p; 1140 BIGNUM *tmp_scalar; 1141 1142 if ((num + 1) == 0 || (num + 1) > OPENSSL_MALLOC_MAX_NELEMS(void *)) { 1143 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1144 return 0; 1145 } 1146 1147 BN_CTX_start(ctx); 1148 1149 if (scalar) { 1150 generator = EC_GROUP_get0_generator(group); 1151 if (generator == NULL) { 1152 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_UNDEFINED_GENERATOR); 1153 goto err; 1154 } 1155 1156 /* look if we can use precomputed multiples of generator */ 1157 pre_comp = group->pre_comp.nistz256; 1158 1159 if (pre_comp) { 1160 /* 1161 * If there is a precomputed table for the generator, check that 1162 * it was generated with the same generator. 1163 */ 1164 EC_POINT *pre_comp_generator = EC_POINT_new(group); 1165 if (pre_comp_generator == NULL) 1166 goto err; 1167 1168 ecp_nistz256_gather_w7(&p.a, pre_comp->precomp[0], 1); 1169 if (!ecp_nistz256_set_from_affine(pre_comp_generator, 1170 group, &p.a, ctx)) { 1171 EC_POINT_free(pre_comp_generator); 1172 goto err; 1173 } 1174 1175 if (0 == EC_POINT_cmp(group, generator, pre_comp_generator, ctx)) 1176 preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp; 1177 1178 EC_POINT_free(pre_comp_generator); 1179 } 1180 1181 if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) { 1182 /* 1183 * If there is no precomputed data, but the generator is the 1184 * default, a hardcoded table of precomputed data is used. This 1185 * is because applications, such as Apache, do not use 1186 * EC_KEY_precompute_mult. 1187 */ 1188 preComputedTable = ecp_nistz256_precomputed; 1189 } 1190 1191 if (preComputedTable) { 1192 if ((BN_num_bits(scalar) > 256) 1193 || BN_is_negative(scalar)) { 1194 if ((tmp_scalar = BN_CTX_get(ctx)) == NULL) 1195 goto err; 1196 1197 if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) { 1198 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_BN_LIB); 1199 goto err; 1200 } 1201 scalar = tmp_scalar; 1202 } 1203 1204 for (i = 0; i < bn_get_top(scalar) * BN_BYTES; i += BN_BYTES) { 1205 BN_ULONG d = bn_get_words(scalar)[i / BN_BYTES]; 1206 1207 p_str[i + 0] = (unsigned char)d; 1208 p_str[i + 1] = (unsigned char)(d >> 8); 1209 p_str[i + 2] = (unsigned char)(d >> 16); 1210 p_str[i + 3] = (unsigned char)(d >>= 24); 1211 if (BN_BYTES == 8) { 1212 d >>= 8; 1213 p_str[i + 4] = (unsigned char)d; 1214 p_str[i + 5] = (unsigned char)(d >> 8); 1215 p_str[i + 6] = (unsigned char)(d >> 16); 1216 p_str[i + 7] = (unsigned char)(d >> 24); 1217 } 1218 } 1219 1220 for (; i < 33; i++) 1221 p_str[i] = 0; 1222 1223 #if defined(ECP_NISTZ256_AVX2) 1224 if (ecp_nistz_avx2_eligible()) { 1225 ecp_nistz256_avx2_mul_g(&p.p, p_str, preComputedTable); 1226 } else 1227 #endif 1228 { 1229 BN_ULONG infty; 1230 1231 /* First window */ 1232 wvalue = (p_str[0] << 1) & mask; 1233 idx += window_size; 1234 1235 wvalue = _booth_recode_w7(wvalue); 1236 1237 ecp_nistz256_gather_w7(&p.a, preComputedTable[0], 1238 wvalue >> 1); 1239 1240 ecp_nistz256_neg(p.p.Z, p.p.Y); 1241 copy_conditional(p.p.Y, p.p.Z, wvalue & 1); 1242 1243 /* 1244 * Since affine infinity is encoded as (0,0) and 1245 * Jacobian ias (,,0), we need to harmonize them 1246 * by assigning "one" or zero to Z. 1247 */ 1248 infty = (p.p.X[0] | p.p.X[1] | p.p.X[2] | p.p.X[3] | 1249 p.p.Y[0] | p.p.Y[1] | p.p.Y[2] | p.p.Y[3]); 1250 if (P256_LIMBS == 8) 1251 infty |= (p.p.X[4] | p.p.X[5] | p.p.X[6] | p.p.X[7] | 1252 p.p.Y[4] | p.p.Y[5] | p.p.Y[6] | p.p.Y[7]); 1253 1254 infty = 0 - is_zero(infty); 1255 infty = ~infty; 1256 1257 p.p.Z[0] = ONE[0] & infty; 1258 p.p.Z[1] = ONE[1] & infty; 1259 p.p.Z[2] = ONE[2] & infty; 1260 p.p.Z[3] = ONE[3] & infty; 1261 if (P256_LIMBS == 8) { 1262 p.p.Z[4] = ONE[4] & infty; 1263 p.p.Z[5] = ONE[5] & infty; 1264 p.p.Z[6] = ONE[6] & infty; 1265 p.p.Z[7] = ONE[7] & infty; 1266 } 1267 1268 for (i = 1; i < 37; i++) { 1269 unsigned int off = (idx - 1) / 8; 1270 wvalue = p_str[off] | p_str[off + 1] << 8; 1271 wvalue = (wvalue >> ((idx - 1) % 8)) & mask; 1272 idx += window_size; 1273 1274 wvalue = _booth_recode_w7(wvalue); 1275 1276 ecp_nistz256_gather_w7(&t.a, 1277 preComputedTable[i], wvalue >> 1); 1278 1279 ecp_nistz256_neg(t.p.Z, t.a.Y); 1280 copy_conditional(t.a.Y, t.p.Z, wvalue & 1); 1281 1282 ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a); 1283 } 1284 } 1285 } else { 1286 p_is_infinity = 1; 1287 no_precomp_for_generator = 1; 1288 } 1289 } else 1290 p_is_infinity = 1; 1291 1292 if (no_precomp_for_generator) { 1293 /* 1294 * Without a precomputed table for the generator, it has to be 1295 * handled like a normal point. 1296 */ 1297 new_scalars = OPENSSL_malloc((num + 1) * sizeof(BIGNUM *)); 1298 if (new_scalars == NULL) { 1299 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1300 goto err; 1301 } 1302 1303 new_points = OPENSSL_malloc((num + 1) * sizeof(EC_POINT *)); 1304 if (new_points == NULL) { 1305 ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE); 1306 goto err; 1307 } 1308 1309 memcpy(new_scalars, scalars, num * sizeof(BIGNUM *)); 1310 new_scalars[num] = scalar; 1311 memcpy(new_points, points, num * sizeof(EC_POINT *)); 1312 new_points[num] = generator; 1313 1314 scalars = new_scalars; 1315 points = new_points; 1316 num++; 1317 } 1318 1319 if (num) { 1320 P256_POINT *out = &t.p; 1321 if (p_is_infinity) 1322 out = &p.p; 1323 1324 if (!ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx)) 1325 goto err; 1326 1327 if (!p_is_infinity) 1328 ecp_nistz256_point_add(&p.p, &p.p, out); 1329 } 1330 1331 /* Not constant-time, but we're only operating on the public output. */ 1332 if (!bn_set_words(r->X, p.p.X, P256_LIMBS) || 1333 !bn_set_words(r->Y, p.p.Y, P256_LIMBS) || 1334 !bn_set_words(r->Z, p.p.Z, P256_LIMBS)) { 1335 goto err; 1336 } 1337 r->Z_is_one = is_one(r->Z) & 1; 1338 1339 ret = 1; 1340 1341 err: 1342 BN_CTX_end(ctx); 1343 OPENSSL_free(new_points); 1344 OPENSSL_free(new_scalars); 1345 return ret; 1346 } 1347 1348 __owur static int ecp_nistz256_get_affine(const EC_GROUP *group, 1349 const EC_POINT *point, 1350 BIGNUM *x, BIGNUM *y, BN_CTX *ctx) 1351 { 1352 BN_ULONG z_inv2[P256_LIMBS]; 1353 BN_ULONG z_inv3[P256_LIMBS]; 1354 BN_ULONG x_aff[P256_LIMBS]; 1355 BN_ULONG y_aff[P256_LIMBS]; 1356 BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS]; 1357 BN_ULONG x_ret[P256_LIMBS], y_ret[P256_LIMBS]; 1358 1359 if (EC_POINT_is_at_infinity(group, point)) { 1360 ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_POINT_AT_INFINITY); 1361 return 0; 1362 } 1363 1364 if (!ecp_nistz256_bignum_to_field_elem(point_x, point->X) || 1365 !ecp_nistz256_bignum_to_field_elem(point_y, point->Y) || 1366 !ecp_nistz256_bignum_to_field_elem(point_z, point->Z)) { 1367 ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_COORDINATES_OUT_OF_RANGE); 1368 return 0; 1369 } 1370 1371 ecp_nistz256_mod_inverse(z_inv3, point_z); 1372 ecp_nistz256_sqr_mont(z_inv2, z_inv3); 1373 ecp_nistz256_mul_mont(x_aff, z_inv2, point_x); 1374 1375 if (x != NULL) { 1376 ecp_nistz256_from_mont(x_ret, x_aff); 1377 if (!bn_set_words(x, x_ret, P256_LIMBS)) 1378 return 0; 1379 } 1380 1381 if (y != NULL) { 1382 ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2); 1383 ecp_nistz256_mul_mont(y_aff, z_inv3, point_y); 1384 ecp_nistz256_from_mont(y_ret, y_aff); 1385 if (!bn_set_words(y, y_ret, P256_LIMBS)) 1386 return 0; 1387 } 1388 1389 return 1; 1390 } 1391 1392 static NISTZ256_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group) 1393 { 1394 NISTZ256_PRE_COMP *ret = NULL; 1395 1396 if (!group) 1397 return NULL; 1398 1399 ret = OPENSSL_zalloc(sizeof(*ret)); 1400 1401 if (ret == NULL) { 1402 ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); 1403 return ret; 1404 } 1405 1406 ret->group = group; 1407 ret->w = 6; /* default */ 1408 ret->references = 1; 1409 1410 ret->lock = CRYPTO_THREAD_lock_new(); 1411 if (ret->lock == NULL) { 1412 ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); 1413 OPENSSL_free(ret); 1414 return NULL; 1415 } 1416 return ret; 1417 } 1418 1419 NISTZ256_PRE_COMP *EC_nistz256_pre_comp_dup(NISTZ256_PRE_COMP *p) 1420 { 1421 int i; 1422 if (p != NULL) 1423 CRYPTO_UP_REF(&p->references, &i, p->lock); 1424 return p; 1425 } 1426 1427 void EC_nistz256_pre_comp_free(NISTZ256_PRE_COMP *pre) 1428 { 1429 int i; 1430 1431 if (pre == NULL) 1432 return; 1433 1434 CRYPTO_DOWN_REF(&pre->references, &i, pre->lock); 1435 REF_PRINT_COUNT("EC_nistz256", x); 1436 if (i > 0) 1437 return; 1438 REF_ASSERT_ISNT(i < 0); 1439 1440 OPENSSL_free(pre->precomp_storage); 1441 CRYPTO_THREAD_lock_free(pre->lock); 1442 OPENSSL_free(pre); 1443 } 1444 1445 1446 static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group) 1447 { 1448 /* There is a hard-coded table for the default generator. */ 1449 const EC_POINT *generator = EC_GROUP_get0_generator(group); 1450 1451 if (generator != NULL && ecp_nistz256_is_affine_G(generator)) { 1452 /* There is a hard-coded table for the default generator. */ 1453 return 1; 1454 } 1455 1456 return HAVEPRECOMP(group, nistz256); 1457 } 1458 1459 #if defined(__x86_64) || defined(__x86_64__) || \ 1460 defined(_M_AMD64) || defined(_M_X64) || \ 1461 defined(__powerpc64__) || defined(_ARCH_PP64) || \ 1462 defined(__aarch64__) 1463 /* 1464 * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P) 1465 */ 1466 void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], 1467 const BN_ULONG a[P256_LIMBS], 1468 const BN_ULONG b[P256_LIMBS]); 1469 void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], 1470 const BN_ULONG a[P256_LIMBS], 1471 int rep); 1472 1473 static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r, 1474 const BIGNUM *x, BN_CTX *ctx) 1475 { 1476 /* RR = 2^512 mod ord(p256) */ 1477 static const BN_ULONG RR[P256_LIMBS] = { 1478 TOBN(0x83244c95,0xbe79eea2), TOBN(0x4699799c,0x49bd6fa6), 1479 TOBN(0x2845b239,0x2b6bec59), TOBN(0x66e12d94,0xf3d95620) 1480 }; 1481 /* The constant 1 (unlike ONE that is one in Montgomery representation) */ 1482 static const BN_ULONG one[P256_LIMBS] = { 1483 TOBN(0,1), TOBN(0,0), TOBN(0,0), TOBN(0,0) 1484 }; 1485 /* 1486 * We don't use entry 0 in the table, so we omit it and address 1487 * with -1 offset. 1488 */ 1489 BN_ULONG table[15][P256_LIMBS]; 1490 BN_ULONG out[P256_LIMBS], t[P256_LIMBS]; 1491 int i, ret = 0; 1492 enum { 1493 i_1 = 0, i_10, i_11, i_101, i_111, i_1010, i_1111, 1494 i_10101, i_101010, i_101111, i_x6, i_x8, i_x16, i_x32 1495 }; 1496 1497 /* 1498 * Catch allocation failure early. 1499 */ 1500 if (bn_wexpand(r, P256_LIMBS) == NULL) { 1501 ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB); 1502 goto err; 1503 } 1504 1505 if ((BN_num_bits(x) > 256) || BN_is_negative(x)) { 1506 BIGNUM *tmp; 1507 1508 if ((tmp = BN_CTX_get(ctx)) == NULL 1509 || !BN_nnmod(tmp, x, group->order, ctx)) { 1510 ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB); 1511 goto err; 1512 } 1513 x = tmp; 1514 } 1515 1516 if (!ecp_nistz256_bignum_to_field_elem(t, x)) { 1517 ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, EC_R_COORDINATES_OUT_OF_RANGE); 1518 goto err; 1519 } 1520 1521 ecp_nistz256_ord_mul_mont(table[0], t, RR); 1522 #if 0 1523 /* 1524 * Original sparse-then-fixed-window algorithm, retained for reference. 1525 */ 1526 for (i = 2; i < 16; i += 2) { 1527 ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1); 1528 ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]); 1529 } 1530 1531 /* 1532 * The top 128bit of the exponent are highly redudndant, so we 1533 * perform an optimized flow 1534 */ 1535 ecp_nistz256_ord_sqr_mont(t, table[15-1], 4); /* f0 */ 1536 ecp_nistz256_ord_mul_mont(t, t, table[15-1]); /* ff */ 1537 1538 ecp_nistz256_ord_sqr_mont(out, t, 8); /* ff00 */ 1539 ecp_nistz256_ord_mul_mont(out, out, t); /* ffff */ 1540 1541 ecp_nistz256_ord_sqr_mont(t, out, 16); /* ffff0000 */ 1542 ecp_nistz256_ord_mul_mont(t, t, out); /* ffffffff */ 1543 1544 ecp_nistz256_ord_sqr_mont(out, t, 64); /* ffffffff0000000000000000 */ 1545 ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffff */ 1546 1547 ecp_nistz256_ord_sqr_mont(out, out, 32); /* ffffffff00000000ffffffff00000000 */ 1548 ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffffffffffff */ 1549 1550 /* 1551 * The bottom 128 bit of the exponent are processed with fixed 4-bit window 1552 */ 1553 for(i = 0; i < 32; i++) { 1554 /* expLo - the low 128 bits of the exponent we use (ord(p256) - 2), 1555 * split into nibbles */ 1556 static const unsigned char expLo[32] = { 1557 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4, 1558 0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf 1559 }; 1560 1561 ecp_nistz256_ord_sqr_mont(out, out, 4); 1562 /* The exponent is public, no need in constant-time access */ 1563 ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]); 1564 } 1565 #else 1566 /* 1567 * https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion 1568 * 1569 * Even though this code path spares 12 squarings, 4.5%, and 13 1570 * multiplications, 25%, on grand scale sign operation is not that 1571 * much faster, not more that 2%... 1572 */ 1573 1574 /* pre-calculate powers */ 1575 ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1); 1576 1577 ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]); 1578 1579 ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]); 1580 1581 ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]); 1582 1583 ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1); 1584 1585 ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]); 1586 1587 ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1); 1588 ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]); 1589 1590 ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1); 1591 1592 ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]); 1593 1594 ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]); 1595 1596 ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2); 1597 ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]); 1598 1599 ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8); 1600 ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]); 1601 1602 ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16); 1603 ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]); 1604 1605 /* calculations */ 1606 ecp_nistz256_ord_sqr_mont(out, table[i_x32], 64); 1607 ecp_nistz256_ord_mul_mont(out, out, table[i_x32]); 1608 1609 for (i = 0; i < 27; i++) { 1610 static const struct { unsigned char p, i; } chain[27] = { 1611 { 32, i_x32 }, { 6, i_101111 }, { 5, i_111 }, 1612 { 4, i_11 }, { 5, i_1111 }, { 5, i_10101 }, 1613 { 4, i_101 }, { 3, i_101 }, { 3, i_101 }, 1614 { 5, i_111 }, { 9, i_101111 }, { 6, i_1111 }, 1615 { 2, i_1 }, { 5, i_1 }, { 6, i_1111 }, 1616 { 5, i_111 }, { 4, i_111 }, { 5, i_111 }, 1617 { 5, i_101 }, { 3, i_11 }, { 10, i_101111 }, 1618 { 2, i_11 }, { 5, i_11 }, { 5, i_11 }, 1619 { 3, i_1 }, { 7, i_10101 }, { 6, i_1111 } 1620 }; 1621 1622 ecp_nistz256_ord_sqr_mont(out, out, chain[i].p); 1623 ecp_nistz256_ord_mul_mont(out, out, table[chain[i].i]); 1624 } 1625 #endif 1626 ecp_nistz256_ord_mul_mont(out, out, one); 1627 1628 /* 1629 * Can't fail, but check return code to be consistent anyway. 1630 */ 1631 if (!bn_set_words(r, out, P256_LIMBS)) 1632 goto err; 1633 1634 ret = 1; 1635 err: 1636 return ret; 1637 } 1638 #else 1639 # define ecp_nistz256_inv_mod_ord NULL 1640 #endif 1641 1642 const EC_METHOD *EC_GFp_nistz256_method(void) 1643 { 1644 static const EC_METHOD ret = { 1645 EC_FLAGS_DEFAULT_OCT, 1646 NID_X9_62_prime_field, 1647 ec_GFp_mont_group_init, 1648 ec_GFp_mont_group_finish, 1649 ec_GFp_mont_group_clear_finish, 1650 ec_GFp_mont_group_copy, 1651 ec_GFp_mont_group_set_curve, 1652 ec_GFp_simple_group_get_curve, 1653 ec_GFp_simple_group_get_degree, 1654 ec_group_simple_order_bits, 1655 ec_GFp_simple_group_check_discriminant, 1656 ec_GFp_simple_point_init, 1657 ec_GFp_simple_point_finish, 1658 ec_GFp_simple_point_clear_finish, 1659 ec_GFp_simple_point_copy, 1660 ec_GFp_simple_point_set_to_infinity, 1661 ec_GFp_simple_set_Jprojective_coordinates_GFp, 1662 ec_GFp_simple_get_Jprojective_coordinates_GFp, 1663 ec_GFp_simple_point_set_affine_coordinates, 1664 ecp_nistz256_get_affine, 1665 0, 0, 0, 1666 ec_GFp_simple_add, 1667 ec_GFp_simple_dbl, 1668 ec_GFp_simple_invert, 1669 ec_GFp_simple_is_at_infinity, 1670 ec_GFp_simple_is_on_curve, 1671 ec_GFp_simple_cmp, 1672 ec_GFp_simple_make_affine, 1673 ec_GFp_simple_points_make_affine, 1674 ecp_nistz256_points_mul, /* mul */ 1675 ecp_nistz256_mult_precompute, /* precompute_mult */ 1676 ecp_nistz256_window_have_precompute_mult, /* have_precompute_mult */ 1677 ec_GFp_mont_field_mul, 1678 ec_GFp_mont_field_sqr, 1679 0, /* field_div */ 1680 ec_GFp_mont_field_encode, 1681 ec_GFp_mont_field_decode, 1682 ec_GFp_mont_field_set_to_one, 1683 ec_key_simple_priv2oct, 1684 ec_key_simple_oct2priv, 1685 0, /* set private */ 1686 ec_key_simple_generate_key, 1687 ec_key_simple_check_key, 1688 ec_key_simple_generate_public_key, 1689 0, /* keycopy */ 1690 0, /* keyfinish */ 1691 ecdh_simple_compute_key, 1692 ecp_nistz256_inv_mod_ord, /* can be #define-d NULL */ 1693 0, /* blind_coordinates */ 1694 0, /* ladder_pre */ 1695 0, /* ladder_step */ 1696 0 /* ladder_post */ 1697 }; 1698 1699 return &ret; 1700 } 1701