1 /* 2 * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 3 * 4 * Licensed under the OpenSSL license (the "License"). You may not use 5 * this file except in compliance with the License. You can obtain a copy 6 * in the file LICENSE in the source distribution or at 7 * https://www.openssl.org/source/license.html 8 */ 9 10 #include <assert.h> 11 #include <openssl/crypto.h> 12 #include "internal/cryptlib.h" 13 #include "bn_lcl.h" 14 15 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 16 17 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 18 BN_ULONG w) 19 { 20 BN_ULONG c1 = 0; 21 22 assert(num >= 0); 23 if (num <= 0) 24 return c1; 25 26 # ifndef OPENSSL_SMALL_FOOTPRINT 27 while (num & ~3) { 28 mul_add(rp[0], ap[0], w, c1); 29 mul_add(rp[1], ap[1], w, c1); 30 mul_add(rp[2], ap[2], w, c1); 31 mul_add(rp[3], ap[3], w, c1); 32 ap += 4; 33 rp += 4; 34 num -= 4; 35 } 36 # endif 37 while (num) { 38 mul_add(rp[0], ap[0], w, c1); 39 ap++; 40 rp++; 41 num--; 42 } 43 44 return c1; 45 } 46 47 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 48 { 49 BN_ULONG c1 = 0; 50 51 assert(num >= 0); 52 if (num <= 0) 53 return c1; 54 55 # ifndef OPENSSL_SMALL_FOOTPRINT 56 while (num & ~3) { 57 mul(rp[0], ap[0], w, c1); 58 mul(rp[1], ap[1], w, c1); 59 mul(rp[2], ap[2], w, c1); 60 mul(rp[3], ap[3], w, c1); 61 ap += 4; 62 rp += 4; 63 num -= 4; 64 } 65 # endif 66 while (num) { 67 mul(rp[0], ap[0], w, c1); 68 ap++; 69 rp++; 70 num--; 71 } 72 return c1; 73 } 74 75 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 76 { 77 assert(n >= 0); 78 if (n <= 0) 79 return; 80 81 # ifndef OPENSSL_SMALL_FOOTPRINT 82 while (n & ~3) { 83 sqr(r[0], r[1], a[0]); 84 sqr(r[2], r[3], a[1]); 85 sqr(r[4], r[5], a[2]); 86 sqr(r[6], r[7], a[3]); 87 a += 4; 88 r += 8; 89 n -= 4; 90 } 91 # endif 92 while (n) { 93 sqr(r[0], r[1], a[0]); 94 a++; 95 r += 2; 96 n--; 97 } 98 } 99 100 #else /* !(defined(BN_LLONG) || 101 * defined(BN_UMULT_HIGH)) */ 102 103 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 104 BN_ULONG w) 105 { 106 BN_ULONG c = 0; 107 BN_ULONG bl, bh; 108 109 assert(num >= 0); 110 if (num <= 0) 111 return (BN_ULONG)0; 112 113 bl = LBITS(w); 114 bh = HBITS(w); 115 116 # ifndef OPENSSL_SMALL_FOOTPRINT 117 while (num & ~3) { 118 mul_add(rp[0], ap[0], bl, bh, c); 119 mul_add(rp[1], ap[1], bl, bh, c); 120 mul_add(rp[2], ap[2], bl, bh, c); 121 mul_add(rp[3], ap[3], bl, bh, c); 122 ap += 4; 123 rp += 4; 124 num -= 4; 125 } 126 # endif 127 while (num) { 128 mul_add(rp[0], ap[0], bl, bh, c); 129 ap++; 130 rp++; 131 num--; 132 } 133 return c; 134 } 135 136 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 137 { 138 BN_ULONG carry = 0; 139 BN_ULONG bl, bh; 140 141 assert(num >= 0); 142 if (num <= 0) 143 return (BN_ULONG)0; 144 145 bl = LBITS(w); 146 bh = HBITS(w); 147 148 # ifndef OPENSSL_SMALL_FOOTPRINT 149 while (num & ~3) { 150 mul(rp[0], ap[0], bl, bh, carry); 151 mul(rp[1], ap[1], bl, bh, carry); 152 mul(rp[2], ap[2], bl, bh, carry); 153 mul(rp[3], ap[3], bl, bh, carry); 154 ap += 4; 155 rp += 4; 156 num -= 4; 157 } 158 # endif 159 while (num) { 160 mul(rp[0], ap[0], bl, bh, carry); 161 ap++; 162 rp++; 163 num--; 164 } 165 return carry; 166 } 167 168 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 169 { 170 assert(n >= 0); 171 if (n <= 0) 172 return; 173 174 # ifndef OPENSSL_SMALL_FOOTPRINT 175 while (n & ~3) { 176 sqr64(r[0], r[1], a[0]); 177 sqr64(r[2], r[3], a[1]); 178 sqr64(r[4], r[5], a[2]); 179 sqr64(r[6], r[7], a[3]); 180 a += 4; 181 r += 8; 182 n -= 4; 183 } 184 # endif 185 while (n) { 186 sqr64(r[0], r[1], a[0]); 187 a++; 188 r += 2; 189 n--; 190 } 191 } 192 193 #endif /* !(defined(BN_LLONG) || 194 * defined(BN_UMULT_HIGH)) */ 195 196 #if defined(BN_LLONG) && defined(BN_DIV2W) 197 198 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 199 { 200 return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); 201 } 202 203 #else 204 205 /* Divide h,l by d and return the result. */ 206 /* I need to test this some more :-( */ 207 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 208 { 209 BN_ULONG dh, dl, q, ret = 0, th, tl, t; 210 int i, count = 2; 211 212 if (d == 0) 213 return BN_MASK2; 214 215 i = BN_num_bits_word(d); 216 assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); 217 218 i = BN_BITS2 - i; 219 if (h >= d) 220 h -= d; 221 222 if (i) { 223 d <<= i; 224 h = (h << i) | (l >> (BN_BITS2 - i)); 225 l <<= i; 226 } 227 dh = (d & BN_MASK2h) >> BN_BITS4; 228 dl = (d & BN_MASK2l); 229 for (;;) { 230 if ((h >> BN_BITS4) == dh) 231 q = BN_MASK2l; 232 else 233 q = h / dh; 234 235 th = q * dh; 236 tl = dl * q; 237 for (;;) { 238 t = h - th; 239 if ((t & BN_MASK2h) || 240 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) 241 break; 242 q--; 243 th -= dh; 244 tl -= dl; 245 } 246 t = (tl >> BN_BITS4); 247 tl = (tl << BN_BITS4) & BN_MASK2h; 248 th += t; 249 250 if (l < tl) 251 th++; 252 l -= tl; 253 if (h < th) { 254 h += d; 255 q--; 256 } 257 h -= th; 258 259 if (--count == 0) 260 break; 261 262 ret = q << BN_BITS4; 263 h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; 264 l = (l & BN_MASK2l) << BN_BITS4; 265 } 266 ret |= q; 267 return ret; 268 } 269 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ 270 271 #ifdef BN_LLONG 272 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 273 int n) 274 { 275 BN_ULLONG ll = 0; 276 277 assert(n >= 0); 278 if (n <= 0) 279 return (BN_ULONG)0; 280 281 # ifndef OPENSSL_SMALL_FOOTPRINT 282 while (n & ~3) { 283 ll += (BN_ULLONG) a[0] + b[0]; 284 r[0] = (BN_ULONG)ll & BN_MASK2; 285 ll >>= BN_BITS2; 286 ll += (BN_ULLONG) a[1] + b[1]; 287 r[1] = (BN_ULONG)ll & BN_MASK2; 288 ll >>= BN_BITS2; 289 ll += (BN_ULLONG) a[2] + b[2]; 290 r[2] = (BN_ULONG)ll & BN_MASK2; 291 ll >>= BN_BITS2; 292 ll += (BN_ULLONG) a[3] + b[3]; 293 r[3] = (BN_ULONG)ll & BN_MASK2; 294 ll >>= BN_BITS2; 295 a += 4; 296 b += 4; 297 r += 4; 298 n -= 4; 299 } 300 # endif 301 while (n) { 302 ll += (BN_ULLONG) a[0] + b[0]; 303 r[0] = (BN_ULONG)ll & BN_MASK2; 304 ll >>= BN_BITS2; 305 a++; 306 b++; 307 r++; 308 n--; 309 } 310 return (BN_ULONG)ll; 311 } 312 #else /* !BN_LLONG */ 313 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 314 int n) 315 { 316 BN_ULONG c, l, t; 317 318 assert(n >= 0); 319 if (n <= 0) 320 return (BN_ULONG)0; 321 322 c = 0; 323 # ifndef OPENSSL_SMALL_FOOTPRINT 324 while (n & ~3) { 325 t = a[0]; 326 t = (t + c) & BN_MASK2; 327 c = (t < c); 328 l = (t + b[0]) & BN_MASK2; 329 c += (l < t); 330 r[0] = l; 331 t = a[1]; 332 t = (t + c) & BN_MASK2; 333 c = (t < c); 334 l = (t + b[1]) & BN_MASK2; 335 c += (l < t); 336 r[1] = l; 337 t = a[2]; 338 t = (t + c) & BN_MASK2; 339 c = (t < c); 340 l = (t + b[2]) & BN_MASK2; 341 c += (l < t); 342 r[2] = l; 343 t = a[3]; 344 t = (t + c) & BN_MASK2; 345 c = (t < c); 346 l = (t + b[3]) & BN_MASK2; 347 c += (l < t); 348 r[3] = l; 349 a += 4; 350 b += 4; 351 r += 4; 352 n -= 4; 353 } 354 # endif 355 while (n) { 356 t = a[0]; 357 t = (t + c) & BN_MASK2; 358 c = (t < c); 359 l = (t + b[0]) & BN_MASK2; 360 c += (l < t); 361 r[0] = l; 362 a++; 363 b++; 364 r++; 365 n--; 366 } 367 return (BN_ULONG)c; 368 } 369 #endif /* !BN_LLONG */ 370 371 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 372 int n) 373 { 374 BN_ULONG t1, t2; 375 int c = 0; 376 377 assert(n >= 0); 378 if (n <= 0) 379 return (BN_ULONG)0; 380 381 #ifndef OPENSSL_SMALL_FOOTPRINT 382 while (n & ~3) { 383 t1 = a[0]; 384 t2 = b[0]; 385 r[0] = (t1 - t2 - c) & BN_MASK2; 386 if (t1 != t2) 387 c = (t1 < t2); 388 t1 = a[1]; 389 t2 = b[1]; 390 r[1] = (t1 - t2 - c) & BN_MASK2; 391 if (t1 != t2) 392 c = (t1 < t2); 393 t1 = a[2]; 394 t2 = b[2]; 395 r[2] = (t1 - t2 - c) & BN_MASK2; 396 if (t1 != t2) 397 c = (t1 < t2); 398 t1 = a[3]; 399 t2 = b[3]; 400 r[3] = (t1 - t2 - c) & BN_MASK2; 401 if (t1 != t2) 402 c = (t1 < t2); 403 a += 4; 404 b += 4; 405 r += 4; 406 n -= 4; 407 } 408 #endif 409 while (n) { 410 t1 = a[0]; 411 t2 = b[0]; 412 r[0] = (t1 - t2 - c) & BN_MASK2; 413 if (t1 != t2) 414 c = (t1 < t2); 415 a++; 416 b++; 417 r++; 418 n--; 419 } 420 return c; 421 } 422 423 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 424 425 # undef bn_mul_comba8 426 # undef bn_mul_comba4 427 # undef bn_sqr_comba8 428 # undef bn_sqr_comba4 429 430 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 431 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 432 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 433 /* 434 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 435 * c=(c2,c1,c0) 436 */ 437 438 # ifdef BN_LLONG 439 /* 440 * Keep in mind that additions to multiplication result can not 441 * overflow, because its high half cannot be all-ones. 442 */ 443 # define mul_add_c(a,b,c0,c1,c2) do { \ 444 BN_ULONG hi; \ 445 BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 446 t += c0; /* no carry */ \ 447 c0 = (BN_ULONG)Lw(t); \ 448 hi = (BN_ULONG)Hw(t); \ 449 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 450 } while(0) 451 452 # define mul_add_c2(a,b,c0,c1,c2) do { \ 453 BN_ULONG hi; \ 454 BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 455 BN_ULLONG tt = t+c0; /* no carry */ \ 456 c0 = (BN_ULONG)Lw(tt); \ 457 hi = (BN_ULONG)Hw(tt); \ 458 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 459 t += c0; /* no carry */ \ 460 c0 = (BN_ULONG)Lw(t); \ 461 hi = (BN_ULONG)Hw(t); \ 462 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 463 } while(0) 464 465 # define sqr_add_c(a,i,c0,c1,c2) do { \ 466 BN_ULONG hi; \ 467 BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ 468 t += c0; /* no carry */ \ 469 c0 = (BN_ULONG)Lw(t); \ 470 hi = (BN_ULONG)Hw(t); \ 471 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 472 } while(0) 473 474 # define sqr_add_c2(a,i,j,c0,c1,c2) \ 475 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 476 477 # elif defined(BN_UMULT_LOHI) 478 /* 479 * Keep in mind that additions to hi can not overflow, because 480 * the high word of a multiplication result cannot be all-ones. 481 */ 482 # define mul_add_c(a,b,c0,c1,c2) do { \ 483 BN_ULONG ta = (a), tb = (b); \ 484 BN_ULONG lo, hi; \ 485 BN_UMULT_LOHI(lo,hi,ta,tb); \ 486 c0 += lo; hi += (c0<lo)?1:0; \ 487 c1 += hi; c2 += (c1<hi)?1:0; \ 488 } while(0) 489 490 # define mul_add_c2(a,b,c0,c1,c2) do { \ 491 BN_ULONG ta = (a), tb = (b); \ 492 BN_ULONG lo, hi, tt; \ 493 BN_UMULT_LOHI(lo,hi,ta,tb); \ 494 c0 += lo; tt = hi+((c0<lo)?1:0); \ 495 c1 += tt; c2 += (c1<tt)?1:0; \ 496 c0 += lo; hi += (c0<lo)?1:0; \ 497 c1 += hi; c2 += (c1<hi)?1:0; \ 498 } while(0) 499 500 # define sqr_add_c(a,i,c0,c1,c2) do { \ 501 BN_ULONG ta = (a)[i]; \ 502 BN_ULONG lo, hi; \ 503 BN_UMULT_LOHI(lo,hi,ta,ta); \ 504 c0 += lo; hi += (c0<lo)?1:0; \ 505 c1 += hi; c2 += (c1<hi)?1:0; \ 506 } while(0) 507 508 # define sqr_add_c2(a,i,j,c0,c1,c2) \ 509 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 510 511 # elif defined(BN_UMULT_HIGH) 512 /* 513 * Keep in mind that additions to hi can not overflow, because 514 * the high word of a multiplication result cannot be all-ones. 515 */ 516 # define mul_add_c(a,b,c0,c1,c2) do { \ 517 BN_ULONG ta = (a), tb = (b); \ 518 BN_ULONG lo = ta * tb; \ 519 BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 520 c0 += lo; hi += (c0<lo)?1:0; \ 521 c1 += hi; c2 += (c1<hi)?1:0; \ 522 } while(0) 523 524 # define mul_add_c2(a,b,c0,c1,c2) do { \ 525 BN_ULONG ta = (a), tb = (b), tt; \ 526 BN_ULONG lo = ta * tb; \ 527 BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 528 c0 += lo; tt = hi + ((c0<lo)?1:0); \ 529 c1 += tt; c2 += (c1<tt)?1:0; \ 530 c0 += lo; hi += (c0<lo)?1:0; \ 531 c1 += hi; c2 += (c1<hi)?1:0; \ 532 } while(0) 533 534 # define sqr_add_c(a,i,c0,c1,c2) do { \ 535 BN_ULONG ta = (a)[i]; \ 536 BN_ULONG lo = ta * ta; \ 537 BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ 538 c0 += lo; hi += (c0<lo)?1:0; \ 539 c1 += hi; c2 += (c1<hi)?1:0; \ 540 } while(0) 541 542 # define sqr_add_c2(a,i,j,c0,c1,c2) \ 543 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 544 545 # else /* !BN_LLONG */ 546 /* 547 * Keep in mind that additions to hi can not overflow, because 548 * the high word of a multiplication result cannot be all-ones. 549 */ 550 # define mul_add_c(a,b,c0,c1,c2) do { \ 551 BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 552 BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 553 mul64(lo,hi,bl,bh); \ 554 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 555 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 556 } while(0) 557 558 # define mul_add_c2(a,b,c0,c1,c2) do { \ 559 BN_ULONG tt; \ 560 BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 561 BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 562 mul64(lo,hi,bl,bh); \ 563 tt = hi; \ 564 c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \ 565 c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \ 566 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 567 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 568 } while(0) 569 570 # define sqr_add_c(a,i,c0,c1,c2) do { \ 571 BN_ULONG lo, hi; \ 572 sqr64(lo,hi,(a)[i]); \ 573 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 574 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 575 } while(0) 576 577 # define sqr_add_c2(a,i,j,c0,c1,c2) \ 578 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 579 # endif /* !BN_LLONG */ 580 581 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 582 { 583 BN_ULONG c1, c2, c3; 584 585 c1 = 0; 586 c2 = 0; 587 c3 = 0; 588 mul_add_c(a[0], b[0], c1, c2, c3); 589 r[0] = c1; 590 c1 = 0; 591 mul_add_c(a[0], b[1], c2, c3, c1); 592 mul_add_c(a[1], b[0], c2, c3, c1); 593 r[1] = c2; 594 c2 = 0; 595 mul_add_c(a[2], b[0], c3, c1, c2); 596 mul_add_c(a[1], b[1], c3, c1, c2); 597 mul_add_c(a[0], b[2], c3, c1, c2); 598 r[2] = c3; 599 c3 = 0; 600 mul_add_c(a[0], b[3], c1, c2, c3); 601 mul_add_c(a[1], b[2], c1, c2, c3); 602 mul_add_c(a[2], b[1], c1, c2, c3); 603 mul_add_c(a[3], b[0], c1, c2, c3); 604 r[3] = c1; 605 c1 = 0; 606 mul_add_c(a[4], b[0], c2, c3, c1); 607 mul_add_c(a[3], b[1], c2, c3, c1); 608 mul_add_c(a[2], b[2], c2, c3, c1); 609 mul_add_c(a[1], b[3], c2, c3, c1); 610 mul_add_c(a[0], b[4], c2, c3, c1); 611 r[4] = c2; 612 c2 = 0; 613 mul_add_c(a[0], b[5], c3, c1, c2); 614 mul_add_c(a[1], b[4], c3, c1, c2); 615 mul_add_c(a[2], b[3], c3, c1, c2); 616 mul_add_c(a[3], b[2], c3, c1, c2); 617 mul_add_c(a[4], b[1], c3, c1, c2); 618 mul_add_c(a[5], b[0], c3, c1, c2); 619 r[5] = c3; 620 c3 = 0; 621 mul_add_c(a[6], b[0], c1, c2, c3); 622 mul_add_c(a[5], b[1], c1, c2, c3); 623 mul_add_c(a[4], b[2], c1, c2, c3); 624 mul_add_c(a[3], b[3], c1, c2, c3); 625 mul_add_c(a[2], b[4], c1, c2, c3); 626 mul_add_c(a[1], b[5], c1, c2, c3); 627 mul_add_c(a[0], b[6], c1, c2, c3); 628 r[6] = c1; 629 c1 = 0; 630 mul_add_c(a[0], b[7], c2, c3, c1); 631 mul_add_c(a[1], b[6], c2, c3, c1); 632 mul_add_c(a[2], b[5], c2, c3, c1); 633 mul_add_c(a[3], b[4], c2, c3, c1); 634 mul_add_c(a[4], b[3], c2, c3, c1); 635 mul_add_c(a[5], b[2], c2, c3, c1); 636 mul_add_c(a[6], b[1], c2, c3, c1); 637 mul_add_c(a[7], b[0], c2, c3, c1); 638 r[7] = c2; 639 c2 = 0; 640 mul_add_c(a[7], b[1], c3, c1, c2); 641 mul_add_c(a[6], b[2], c3, c1, c2); 642 mul_add_c(a[5], b[3], c3, c1, c2); 643 mul_add_c(a[4], b[4], c3, c1, c2); 644 mul_add_c(a[3], b[5], c3, c1, c2); 645 mul_add_c(a[2], b[6], c3, c1, c2); 646 mul_add_c(a[1], b[7], c3, c1, c2); 647 r[8] = c3; 648 c3 = 0; 649 mul_add_c(a[2], b[7], c1, c2, c3); 650 mul_add_c(a[3], b[6], c1, c2, c3); 651 mul_add_c(a[4], b[5], c1, c2, c3); 652 mul_add_c(a[5], b[4], c1, c2, c3); 653 mul_add_c(a[6], b[3], c1, c2, c3); 654 mul_add_c(a[7], b[2], c1, c2, c3); 655 r[9] = c1; 656 c1 = 0; 657 mul_add_c(a[7], b[3], c2, c3, c1); 658 mul_add_c(a[6], b[4], c2, c3, c1); 659 mul_add_c(a[5], b[5], c2, c3, c1); 660 mul_add_c(a[4], b[6], c2, c3, c1); 661 mul_add_c(a[3], b[7], c2, c3, c1); 662 r[10] = c2; 663 c2 = 0; 664 mul_add_c(a[4], b[7], c3, c1, c2); 665 mul_add_c(a[5], b[6], c3, c1, c2); 666 mul_add_c(a[6], b[5], c3, c1, c2); 667 mul_add_c(a[7], b[4], c3, c1, c2); 668 r[11] = c3; 669 c3 = 0; 670 mul_add_c(a[7], b[5], c1, c2, c3); 671 mul_add_c(a[6], b[6], c1, c2, c3); 672 mul_add_c(a[5], b[7], c1, c2, c3); 673 r[12] = c1; 674 c1 = 0; 675 mul_add_c(a[6], b[7], c2, c3, c1); 676 mul_add_c(a[7], b[6], c2, c3, c1); 677 r[13] = c2; 678 c2 = 0; 679 mul_add_c(a[7], b[7], c3, c1, c2); 680 r[14] = c3; 681 r[15] = c1; 682 } 683 684 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 685 { 686 BN_ULONG c1, c2, c3; 687 688 c1 = 0; 689 c2 = 0; 690 c3 = 0; 691 mul_add_c(a[0], b[0], c1, c2, c3); 692 r[0] = c1; 693 c1 = 0; 694 mul_add_c(a[0], b[1], c2, c3, c1); 695 mul_add_c(a[1], b[0], c2, c3, c1); 696 r[1] = c2; 697 c2 = 0; 698 mul_add_c(a[2], b[0], c3, c1, c2); 699 mul_add_c(a[1], b[1], c3, c1, c2); 700 mul_add_c(a[0], b[2], c3, c1, c2); 701 r[2] = c3; 702 c3 = 0; 703 mul_add_c(a[0], b[3], c1, c2, c3); 704 mul_add_c(a[1], b[2], c1, c2, c3); 705 mul_add_c(a[2], b[1], c1, c2, c3); 706 mul_add_c(a[3], b[0], c1, c2, c3); 707 r[3] = c1; 708 c1 = 0; 709 mul_add_c(a[3], b[1], c2, c3, c1); 710 mul_add_c(a[2], b[2], c2, c3, c1); 711 mul_add_c(a[1], b[3], c2, c3, c1); 712 r[4] = c2; 713 c2 = 0; 714 mul_add_c(a[2], b[3], c3, c1, c2); 715 mul_add_c(a[3], b[2], c3, c1, c2); 716 r[5] = c3; 717 c3 = 0; 718 mul_add_c(a[3], b[3], c1, c2, c3); 719 r[6] = c1; 720 r[7] = c2; 721 } 722 723 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 724 { 725 BN_ULONG c1, c2, c3; 726 727 c1 = 0; 728 c2 = 0; 729 c3 = 0; 730 sqr_add_c(a, 0, c1, c2, c3); 731 r[0] = c1; 732 c1 = 0; 733 sqr_add_c2(a, 1, 0, c2, c3, c1); 734 r[1] = c2; 735 c2 = 0; 736 sqr_add_c(a, 1, c3, c1, c2); 737 sqr_add_c2(a, 2, 0, c3, c1, c2); 738 r[2] = c3; 739 c3 = 0; 740 sqr_add_c2(a, 3, 0, c1, c2, c3); 741 sqr_add_c2(a, 2, 1, c1, c2, c3); 742 r[3] = c1; 743 c1 = 0; 744 sqr_add_c(a, 2, c2, c3, c1); 745 sqr_add_c2(a, 3, 1, c2, c3, c1); 746 sqr_add_c2(a, 4, 0, c2, c3, c1); 747 r[4] = c2; 748 c2 = 0; 749 sqr_add_c2(a, 5, 0, c3, c1, c2); 750 sqr_add_c2(a, 4, 1, c3, c1, c2); 751 sqr_add_c2(a, 3, 2, c3, c1, c2); 752 r[5] = c3; 753 c3 = 0; 754 sqr_add_c(a, 3, c1, c2, c3); 755 sqr_add_c2(a, 4, 2, c1, c2, c3); 756 sqr_add_c2(a, 5, 1, c1, c2, c3); 757 sqr_add_c2(a, 6, 0, c1, c2, c3); 758 r[6] = c1; 759 c1 = 0; 760 sqr_add_c2(a, 7, 0, c2, c3, c1); 761 sqr_add_c2(a, 6, 1, c2, c3, c1); 762 sqr_add_c2(a, 5, 2, c2, c3, c1); 763 sqr_add_c2(a, 4, 3, c2, c3, c1); 764 r[7] = c2; 765 c2 = 0; 766 sqr_add_c(a, 4, c3, c1, c2); 767 sqr_add_c2(a, 5, 3, c3, c1, c2); 768 sqr_add_c2(a, 6, 2, c3, c1, c2); 769 sqr_add_c2(a, 7, 1, c3, c1, c2); 770 r[8] = c3; 771 c3 = 0; 772 sqr_add_c2(a, 7, 2, c1, c2, c3); 773 sqr_add_c2(a, 6, 3, c1, c2, c3); 774 sqr_add_c2(a, 5, 4, c1, c2, c3); 775 r[9] = c1; 776 c1 = 0; 777 sqr_add_c(a, 5, c2, c3, c1); 778 sqr_add_c2(a, 6, 4, c2, c3, c1); 779 sqr_add_c2(a, 7, 3, c2, c3, c1); 780 r[10] = c2; 781 c2 = 0; 782 sqr_add_c2(a, 7, 4, c3, c1, c2); 783 sqr_add_c2(a, 6, 5, c3, c1, c2); 784 r[11] = c3; 785 c3 = 0; 786 sqr_add_c(a, 6, c1, c2, c3); 787 sqr_add_c2(a, 7, 5, c1, c2, c3); 788 r[12] = c1; 789 c1 = 0; 790 sqr_add_c2(a, 7, 6, c2, c3, c1); 791 r[13] = c2; 792 c2 = 0; 793 sqr_add_c(a, 7, c3, c1, c2); 794 r[14] = c3; 795 r[15] = c1; 796 } 797 798 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 799 { 800 BN_ULONG c1, c2, c3; 801 802 c1 = 0; 803 c2 = 0; 804 c3 = 0; 805 sqr_add_c(a, 0, c1, c2, c3); 806 r[0] = c1; 807 c1 = 0; 808 sqr_add_c2(a, 1, 0, c2, c3, c1); 809 r[1] = c2; 810 c2 = 0; 811 sqr_add_c(a, 1, c3, c1, c2); 812 sqr_add_c2(a, 2, 0, c3, c1, c2); 813 r[2] = c3; 814 c3 = 0; 815 sqr_add_c2(a, 3, 0, c1, c2, c3); 816 sqr_add_c2(a, 2, 1, c1, c2, c3); 817 r[3] = c1; 818 c1 = 0; 819 sqr_add_c(a, 2, c2, c3, c1); 820 sqr_add_c2(a, 3, 1, c2, c3, c1); 821 r[4] = c2; 822 c2 = 0; 823 sqr_add_c2(a, 3, 2, c3, c1, c2); 824 r[5] = c3; 825 c3 = 0; 826 sqr_add_c(a, 3, c1, c2, c3); 827 r[6] = c1; 828 r[7] = c2; 829 } 830 831 # ifdef OPENSSL_NO_ASM 832 # ifdef OPENSSL_BN_ASM_MONT 833 # include <alloca.h> 834 /* 835 * This is essentially reference implementation, which may or may not 836 * result in performance improvement. E.g. on IA-32 this routine was 837 * observed to give 40% faster rsa1024 private key operations and 10% 838 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only 839 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a 840 * reference implementation, one to be used as starting point for 841 * platform-specific assembler. Mentioned numbers apply to compiler 842 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and 843 * can vary not only from platform to platform, but even for compiler 844 * versions. Assembler vs. assembler improvement coefficients can 845 * [and are known to] differ and are to be documented elsewhere. 846 */ 847 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 848 const BN_ULONG *np, const BN_ULONG *n0p, int num) 849 { 850 BN_ULONG c0, c1, ml, *tp, n0; 851 # ifdef mul64 852 BN_ULONG mh; 853 # endif 854 volatile BN_ULONG *vp; 855 int i = 0, j; 856 857 # if 0 /* template for platform-specific 858 * implementation */ 859 if (ap == bp) 860 return bn_sqr_mont(rp, ap, np, n0p, num); 861 # endif 862 vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 863 864 n0 = *n0p; 865 866 c0 = 0; 867 ml = bp[0]; 868 # ifdef mul64 869 mh = HBITS(ml); 870 ml = LBITS(ml); 871 for (j = 0; j < num; ++j) 872 mul(tp[j], ap[j], ml, mh, c0); 873 # else 874 for (j = 0; j < num; ++j) 875 mul(tp[j], ap[j], ml, c0); 876 # endif 877 878 tp[num] = c0; 879 tp[num + 1] = 0; 880 goto enter; 881 882 for (i = 0; i < num; i++) { 883 c0 = 0; 884 ml = bp[i]; 885 # ifdef mul64 886 mh = HBITS(ml); 887 ml = LBITS(ml); 888 for (j = 0; j < num; ++j) 889 mul_add(tp[j], ap[j], ml, mh, c0); 890 # else 891 for (j = 0; j < num; ++j) 892 mul_add(tp[j], ap[j], ml, c0); 893 # endif 894 c1 = (tp[num] + c0) & BN_MASK2; 895 tp[num] = c1; 896 tp[num + 1] = (c1 < c0 ? 1 : 0); 897 enter: 898 c1 = tp[0]; 899 ml = (c1 * n0) & BN_MASK2; 900 c0 = 0; 901 # ifdef mul64 902 mh = HBITS(ml); 903 ml = LBITS(ml); 904 mul_add(c1, np[0], ml, mh, c0); 905 # else 906 mul_add(c1, ml, np[0], c0); 907 # endif 908 for (j = 1; j < num; j++) { 909 c1 = tp[j]; 910 # ifdef mul64 911 mul_add(c1, np[j], ml, mh, c0); 912 # else 913 mul_add(c1, ml, np[j], c0); 914 # endif 915 tp[j - 1] = c1 & BN_MASK2; 916 } 917 c1 = (tp[num] + c0) & BN_MASK2; 918 tp[num - 1] = c1; 919 tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); 920 } 921 922 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 923 c0 = bn_sub_words(rp, tp, np, num); 924 if (tp[num] != 0 || c0 == 0) { 925 for (i = 0; i < num + 2; i++) 926 vp[i] = 0; 927 return 1; 928 } 929 } 930 for (i = 0; i < num; i++) 931 rp[i] = tp[i], vp[i] = 0; 932 vp[num] = 0; 933 vp[num + 1] = 0; 934 return 1; 935 } 936 # else 937 /* 938 * Return value of 0 indicates that multiplication/convolution was not 939 * performed to signal the caller to fall down to alternative/original 940 * code-path. 941 */ 942 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 943 const BN_ULONG *np, const BN_ULONG *n0, int num) 944 { 945 return 0; 946 } 947 # endif /* OPENSSL_BN_ASM_MONT */ 948 # endif 949 950 #else /* !BN_MUL_COMBA */ 951 952 /* hmm... is it faster just to do a multiply? */ 953 # undef bn_sqr_comba4 954 # undef bn_sqr_comba8 955 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 956 { 957 BN_ULONG t[8]; 958 bn_sqr_normal(r, a, 4, t); 959 } 960 961 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 962 { 963 BN_ULONG t[16]; 964 bn_sqr_normal(r, a, 8, t); 965 } 966 967 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 968 { 969 r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); 970 r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); 971 r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); 972 r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); 973 } 974 975 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 976 { 977 r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); 978 r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); 979 r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); 980 r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); 981 r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); 982 r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); 983 r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); 984 r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); 985 } 986 987 # ifdef OPENSSL_NO_ASM 988 # ifdef OPENSSL_BN_ASM_MONT 989 # include <alloca.h> 990 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 991 const BN_ULONG *np, const BN_ULONG *n0p, int num) 992 { 993 BN_ULONG c0, c1, *tp, n0 = *n0p; 994 volatile BN_ULONG *vp; 995 int i = 0, j; 996 997 vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 998 999 for (i = 0; i <= num; i++) 1000 tp[i] = 0; 1001 1002 for (i = 0; i < num; i++) { 1003 c0 = bn_mul_add_words(tp, ap, num, bp[i]); 1004 c1 = (tp[num] + c0) & BN_MASK2; 1005 tp[num] = c1; 1006 tp[num + 1] = (c1 < c0 ? 1 : 0); 1007 1008 c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); 1009 c1 = (tp[num] + c0) & BN_MASK2; 1010 tp[num] = c1; 1011 tp[num + 1] += (c1 < c0 ? 1 : 0); 1012 for (j = 0; j <= num; j++) 1013 tp[j] = tp[j + 1]; 1014 } 1015 1016 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 1017 c0 = bn_sub_words(rp, tp, np, num); 1018 if (tp[num] != 0 || c0 == 0) { 1019 for (i = 0; i < num + 2; i++) 1020 vp[i] = 0; 1021 return 1; 1022 } 1023 } 1024 for (i = 0; i < num; i++) 1025 rp[i] = tp[i], vp[i] = 0; 1026 vp[num] = 0; 1027 vp[num + 1] = 0; 1028 return 1; 1029 } 1030 # else 1031 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1032 const BN_ULONG *np, const BN_ULONG *n0, int num) 1033 { 1034 return 0; 1035 } 1036 # endif /* OPENSSL_BN_ASM_MONT */ 1037 # endif 1038 1039 #endif /* !BN_MUL_COMBA */ 1040