1 /* $OpenBSD: bn_asm.c,v 1.13 2014/07/11 08:44:47 jsing Exp $ */ 2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 3 * All rights reserved. 4 * 5 * This package is an SSL implementation written 6 * by Eric Young (eay@cryptsoft.com). 7 * The implementation was written so as to conform with Netscapes SSL. 8 * 9 * This library is free for commercial and non-commercial use as long as 10 * the following conditions are aheared to. The following conditions 11 * apply to all code found in this distribution, be it the RC4, RSA, 12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation 13 * included with this distribution is covered by the same copyright terms 14 * except that the holder is Tim Hudson (tjh@cryptsoft.com). 15 * 16 * Copyright remains Eric Young's, and as such any Copyright notices in 17 * the code are not to be removed. 18 * If this package is used in a product, Eric Young should be given attribution 19 * as the author of the parts of the library used. 20 * This can be in the form of a textual message at program startup or 21 * in documentation (online or textual) provided with the package. 22 * 23 * Redistribution and use in source and binary forms, with or without 24 * modification, are permitted provided that the following conditions 25 * are met: 26 * 1. Redistributions of source code must retain the copyright 27 * notice, this list of conditions and the following disclaimer. 28 * 2. Redistributions in binary form must reproduce the above copyright 29 * notice, this list of conditions and the following disclaimer in the 30 * documentation and/or other materials provided with the distribution. 31 * 3. All advertising materials mentioning features or use of this software 32 * must display the following acknowledgement: 33 * "This product includes cryptographic software written by 34 * Eric Young (eay@cryptsoft.com)" 35 * The word 'cryptographic' can be left out if the rouines from the library 36 * being used are not cryptographic related :-). 37 * 4. If you include any Windows specific code (or a derivative thereof) from 38 * the apps directory (application code) you must include an acknowledgement: 39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" 40 * 41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND 42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 51 * SUCH DAMAGE. 52 * 53 * The licence and distribution terms for any publically available version or 54 * derivative of this code cannot be changed. i.e. this code cannot simply be 55 * copied and put under another distribution licence 56 * [including the GNU Public Licence.] 57 */ 58 59 #ifndef BN_DEBUG 60 # undef NDEBUG /* avoid conflicting definitions */ 61 # define NDEBUG 62 #endif 63 64 #include <assert.h> 65 #include <stdio.h> 66 67 #include <openssl/opensslconf.h> 68 69 #include "bn_lcl.h" 70 71 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 72 73 BN_ULONG 74 bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 75 { 76 BN_ULONG c1 = 0; 77 78 assert(num >= 0); 79 if (num <= 0) 80 return (c1); 81 82 #ifndef OPENSSL_SMALL_FOOTPRINT 83 while (num & ~3) { 84 mul_add(rp[0], ap[0], w, c1); 85 mul_add(rp[1], ap[1], w, c1); 86 mul_add(rp[2], ap[2], w, c1); 87 mul_add(rp[3], ap[3], w, c1); 88 ap += 4; 89 rp += 4; 90 num -= 4; 91 } 92 #endif 93 while (num) { 94 mul_add(rp[0], ap[0], w, c1); 95 ap++; 96 rp++; 97 num--; 98 } 99 100 return (c1); 101 } 102 103 BN_ULONG 104 bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 105 { 106 BN_ULONG c1 = 0; 107 108 assert(num >= 0); 109 if (num <= 0) 110 return (c1); 111 112 #ifndef OPENSSL_SMALL_FOOTPRINT 113 while (num & ~3) { 114 mul(rp[0], ap[0], w, c1); 115 mul(rp[1], ap[1], w, c1); 116 mul(rp[2], ap[2], w, c1); 117 mul(rp[3], ap[3], w, c1); 118 ap += 4; 119 rp += 4; 120 num -= 4; 121 } 122 #endif 123 while (num) { 124 mul(rp[0], ap[0], w, c1); 125 ap++; 126 rp++; 127 num--; 128 } 129 return (c1); 130 } 131 132 void 133 bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 134 { 135 assert(n >= 0); 136 if (n <= 0) 137 return; 138 139 #ifndef OPENSSL_SMALL_FOOTPRINT 140 while (n & ~3) { 141 sqr(r[0], r[1], a[0]); 142 sqr(r[2], r[3], a[1]); 143 sqr(r[4], r[5], a[2]); 144 sqr(r[6], r[7], a[3]); 145 a += 4; 146 r += 8; 147 n -= 4; 148 } 149 #endif 150 while (n) { 151 sqr(r[0], r[1], a[0]); 152 a++; 153 r += 2; 154 n--; 155 } 156 } 157 158 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ 159 160 BN_ULONG 161 bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 162 { 163 BN_ULONG c = 0; 164 BN_ULONG bl, bh; 165 166 assert(num >= 0); 167 if (num <= 0) 168 return ((BN_ULONG)0); 169 170 bl = LBITS(w); 171 bh = HBITS(w); 172 173 #ifndef OPENSSL_SMALL_FOOTPRINT 174 while (num & ~3) { 175 mul_add(rp[0], ap[0], bl, bh, c); 176 mul_add(rp[1], ap[1], bl, bh, c); 177 mul_add(rp[2], ap[2], bl, bh, c); 178 mul_add(rp[3], ap[3], bl, bh, c); 179 ap += 4; 180 rp += 4; 181 num -= 4; 182 } 183 #endif 184 while (num) { 185 mul_add(rp[0], ap[0], bl, bh, c); 186 ap++; 187 rp++; 188 num--; 189 } 190 return (c); 191 } 192 193 BN_ULONG 194 bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 195 { 196 BN_ULONG carry = 0; 197 BN_ULONG bl, bh; 198 199 assert(num >= 0); 200 if (num <= 0) 201 return ((BN_ULONG)0); 202 203 bl = LBITS(w); 204 bh = HBITS(w); 205 206 #ifndef OPENSSL_SMALL_FOOTPRINT 207 while (num & ~3) { 208 mul(rp[0], ap[0], bl, bh, carry); 209 mul(rp[1], ap[1], bl, bh, carry); 210 mul(rp[2], ap[2], bl, bh, carry); 211 mul(rp[3], ap[3], bl, bh, carry); 212 ap += 4; 213 rp += 4; 214 num -= 4; 215 } 216 #endif 217 while (num) { 218 mul(rp[0], ap[0], bl, bh, carry); 219 ap++; 220 rp++; 221 num--; 222 } 223 return (carry); 224 } 225 226 void 227 bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 228 { 229 assert(n >= 0); 230 if (n <= 0) 231 return; 232 233 #ifndef OPENSSL_SMALL_FOOTPRINT 234 while (n & ~3) { 235 sqr64(r[0], r[1], a[0]); 236 sqr64(r[2], r[3], a[1]); 237 sqr64(r[4], r[5], a[2]); 238 sqr64(r[6], r[7], a[3]); 239 a += 4; 240 r += 8; 241 n -= 4; 242 } 243 #endif 244 while (n) { 245 sqr64(r[0], r[1], a[0]); 246 a++; 247 r += 2; 248 n--; 249 } 250 } 251 252 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ 253 254 #if defined(BN_LLONG) && defined(BN_DIV2W) 255 256 BN_ULONG 257 bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 258 { 259 return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2)|l)/(BN_ULLONG)d)); 260 } 261 262 #else 263 264 /* Divide h,l by d and return the result. */ 265 /* I need to test this some more :-( */ 266 BN_ULONG 267 bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 268 { 269 BN_ULONG dh, dl, q,ret = 0, th, tl, t; 270 int i, count = 2; 271 272 if (d == 0) 273 return (BN_MASK2); 274 275 i = BN_num_bits_word(d); 276 assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); 277 278 i = BN_BITS2 - i; 279 if (h >= d) 280 h -= d; 281 282 if (i) { 283 d <<= i; 284 h = (h << i) | (l >> (BN_BITS2 - i)); 285 l <<= i; 286 } 287 dh = (d & BN_MASK2h) >> BN_BITS4; 288 dl = (d & BN_MASK2l); 289 for (;;) { 290 if ((h >> BN_BITS4) == dh) 291 q = BN_MASK2l; 292 else 293 q = h / dh; 294 295 th = q * dh; 296 tl = dl * q; 297 for (;;) { 298 t = h - th; 299 if ((t & BN_MASK2h) || 300 ((tl) <= ( 301 (t << BN_BITS4) | 302 ((l & BN_MASK2h) >> BN_BITS4)))) 303 break; 304 q--; 305 th -= dh; 306 tl -= dl; 307 } 308 t = (tl >> BN_BITS4); 309 tl = (tl << BN_BITS4) & BN_MASK2h; 310 th += t; 311 312 if (l < tl) 313 th++; 314 l -= tl; 315 if (h < th) { 316 h += d; 317 q--; 318 } 319 h -= th; 320 321 if (--count == 0) 322 break; 323 324 ret = q << BN_BITS4; 325 h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; 326 l = (l & BN_MASK2l) << BN_BITS4; 327 } 328 ret |= q; 329 return (ret); 330 } 331 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ 332 333 #ifdef BN_LLONG 334 BN_ULONG 335 bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) 336 { 337 BN_ULLONG ll = 0; 338 339 assert(n >= 0); 340 if (n <= 0) 341 return ((BN_ULONG)0); 342 343 #ifndef OPENSSL_SMALL_FOOTPRINT 344 while (n & ~3) { 345 ll += (BN_ULLONG)a[0] + b[0]; 346 r[0] = (BN_ULONG)ll & BN_MASK2; 347 ll >>= BN_BITS2; 348 ll += (BN_ULLONG)a[1] + b[1]; 349 r[1] = (BN_ULONG)ll & BN_MASK2; 350 ll >>= BN_BITS2; 351 ll += (BN_ULLONG)a[2] + b[2]; 352 r[2] = (BN_ULONG)ll & BN_MASK2; 353 ll >>= BN_BITS2; 354 ll += (BN_ULLONG)a[3] + b[3]; 355 r[3] = (BN_ULONG)ll & BN_MASK2; 356 ll >>= BN_BITS2; 357 a += 4; 358 b += 4; 359 r += 4; 360 n -= 4; 361 } 362 #endif 363 while (n) { 364 ll += (BN_ULLONG)a[0] + b[0]; 365 r[0] = (BN_ULONG)ll & BN_MASK2; 366 ll >>= BN_BITS2; 367 a++; 368 b++; 369 r++; 370 n--; 371 } 372 return ((BN_ULONG)ll); 373 } 374 #else /* !BN_LLONG */ 375 BN_ULONG 376 bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) 377 { 378 BN_ULONG c, l, t; 379 380 assert(n >= 0); 381 if (n <= 0) 382 return ((BN_ULONG)0); 383 384 c = 0; 385 #ifndef OPENSSL_SMALL_FOOTPRINT 386 while (n & ~3) { 387 t = a[0]; 388 t = (t + c) & BN_MASK2; 389 c = (t < c); 390 l = (t + b[0]) & BN_MASK2; 391 c += (l < t); 392 r[0] = l; 393 t = a[1]; 394 t = (t + c) & BN_MASK2; 395 c = (t < c); 396 l = (t + b[1]) & BN_MASK2; 397 c += (l < t); 398 r[1] = l; 399 t = a[2]; 400 t = (t + c) & BN_MASK2; 401 c = (t < c); 402 l = (t + b[2]) & BN_MASK2; 403 c += (l < t); 404 r[2] = l; 405 t = a[3]; 406 t = (t + c) & BN_MASK2; 407 c = (t < c); 408 l = (t + b[3]) & BN_MASK2; 409 c += (l < t); 410 r[3] = l; 411 a += 4; 412 b += 4; 413 r += 4; 414 n -= 4; 415 } 416 #endif 417 while (n) { 418 t = a[0]; 419 t = (t + c) & BN_MASK2; 420 c = (t < c); 421 l = (t + b[0]) & BN_MASK2; 422 c += (l < t); 423 r[0] = l; 424 a++; 425 b++; 426 r++; 427 n--; 428 } 429 return ((BN_ULONG)c); 430 } 431 #endif /* !BN_LLONG */ 432 433 BN_ULONG 434 bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) 435 { 436 BN_ULONG t1, t2; 437 int c = 0; 438 439 assert(n >= 0); 440 if (n <= 0) 441 return ((BN_ULONG)0); 442 443 #ifndef OPENSSL_SMALL_FOOTPRINT 444 while (n&~3) { 445 t1 = a[0]; 446 t2 = b[0]; 447 r[0] = (t1 - t2 - c) & BN_MASK2; 448 if (t1 != t2) 449 c = (t1 < t2); 450 t1 = a[1]; 451 t2 = b[1]; 452 r[1] = (t1 - t2 - c) & BN_MASK2; 453 if (t1 != t2) 454 c = (t1 < t2); 455 t1 = a[2]; 456 t2 = b[2]; 457 r[2] = (t1 - t2 - c) & BN_MASK2; 458 if (t1 != t2) 459 c = (t1 < t2); 460 t1 = a[3]; 461 t2 = b[3]; 462 r[3] = (t1 - t2 - c) & BN_MASK2; 463 if (t1 != t2) 464 c = (t1 < t2); 465 a += 4; 466 b += 4; 467 r += 4; 468 n -= 4; 469 } 470 #endif 471 while (n) { 472 t1 = a[0]; 473 t2 = b[0]; 474 r[0] = (t1 - t2 - c) & BN_MASK2; 475 if (t1 != t2) 476 c = (t1 < t2); 477 a++; 478 b++; 479 r++; 480 n--; 481 } 482 return (c); 483 } 484 485 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 486 487 #undef bn_mul_comba8 488 #undef bn_mul_comba4 489 #undef bn_sqr_comba8 490 #undef bn_sqr_comba4 491 492 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 493 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 494 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 495 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ 496 497 #ifdef BN_LLONG 498 /* 499 * Keep in mind that additions to multiplication result can not 500 * overflow, because its high half cannot be all-ones. 501 */ 502 #define mul_add_c(a,b,c0,c1,c2) do { \ 503 BN_ULONG hi; \ 504 BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 505 t += c0; /* no carry */ \ 506 c0 = (BN_ULONG)Lw(t); \ 507 hi = (BN_ULONG)Hw(t); \ 508 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 509 } while(0) 510 511 #define mul_add_c2(a,b,c0,c1,c2) do { \ 512 BN_ULONG hi; \ 513 BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 514 BN_ULLONG tt = t+c0; /* no carry */ \ 515 c0 = (BN_ULONG)Lw(tt); \ 516 hi = (BN_ULONG)Hw(tt); \ 517 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 518 t += c0; /* no carry */ \ 519 c0 = (BN_ULONG)Lw(t); \ 520 hi = (BN_ULONG)Hw(t); \ 521 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 522 } while(0) 523 524 #define sqr_add_c(a,i,c0,c1,c2) do { \ 525 BN_ULONG hi; \ 526 BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ 527 t += c0; /* no carry */ \ 528 c0 = (BN_ULONG)Lw(t); \ 529 hi = (BN_ULONG)Hw(t); \ 530 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 531 } while(0) 532 533 #define sqr_add_c2(a,i,j,c0,c1,c2) \ 534 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 535 536 #elif defined(BN_UMULT_LOHI) 537 /* 538 * Keep in mind that additions to hi can not overflow, because 539 * the high word of a multiplication result cannot be all-ones. 540 */ 541 #define mul_add_c(a,b,c0,c1,c2) do { \ 542 BN_ULONG ta = (a), tb = (b); \ 543 BN_ULONG lo, hi; \ 544 BN_UMULT_LOHI(lo,hi,ta,tb); \ 545 c0 += lo; hi += (c0<lo)?1:0; \ 546 c1 += hi; c2 += (c1<hi)?1:0; \ 547 } while(0) 548 549 #define mul_add_c2(a,b,c0,c1,c2) do { \ 550 BN_ULONG ta = (a), tb = (b); \ 551 BN_ULONG lo, hi, tt; \ 552 BN_UMULT_LOHI(lo,hi,ta,tb); \ 553 c0 += lo; tt = hi+((c0<lo)?1:0); \ 554 c1 += tt; c2 += (c1<tt)?1:0; \ 555 c0 += lo; hi += (c0<lo)?1:0; \ 556 c1 += hi; c2 += (c1<hi)?1:0; \ 557 } while(0) 558 559 #define sqr_add_c(a,i,c0,c1,c2) do { \ 560 BN_ULONG ta = (a)[i]; \ 561 BN_ULONG lo, hi; \ 562 BN_UMULT_LOHI(lo,hi,ta,ta); \ 563 c0 += lo; hi += (c0<lo)?1:0; \ 564 c1 += hi; c2 += (c1<hi)?1:0; \ 565 } while(0) 566 567 #define sqr_add_c2(a,i,j,c0,c1,c2) \ 568 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 569 570 #elif defined(BN_UMULT_HIGH) 571 /* 572 * Keep in mind that additions to hi can not overflow, because 573 * the high word of a multiplication result cannot be all-ones. 574 */ 575 #define mul_add_c(a,b,c0,c1,c2) do { \ 576 BN_ULONG ta = (a), tb = (b); \ 577 BN_ULONG lo = ta * tb; \ 578 BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 579 c0 += lo; hi += (c0<lo)?1:0; \ 580 c1 += hi; c2 += (c1<hi)?1:0; \ 581 } while(0) 582 583 #define mul_add_c2(a,b,c0,c1,c2) do { \ 584 BN_ULONG ta = (a), tb = (b), tt; \ 585 BN_ULONG lo = ta * tb; \ 586 BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 587 c0 += lo; tt = hi + ((c0<lo)?1:0); \ 588 c1 += tt; c2 += (c1<tt)?1:0; \ 589 c0 += lo; hi += (c0<lo)?1:0; \ 590 c1 += hi; c2 += (c1<hi)?1:0; \ 591 } while(0) 592 593 #define sqr_add_c(a,i,c0,c1,c2) do { \ 594 BN_ULONG ta = (a)[i]; \ 595 BN_ULONG lo = ta * ta; \ 596 BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ 597 c0 += lo; hi += (c0<lo)?1:0; \ 598 c1 += hi; c2 += (c1<hi)?1:0; \ 599 } while(0) 600 601 #define sqr_add_c2(a,i,j,c0,c1,c2) \ 602 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 603 604 #else /* !BN_LLONG */ 605 /* 606 * Keep in mind that additions to hi can not overflow, because 607 * the high word of a multiplication result cannot be all-ones. 608 */ 609 #define mul_add_c(a,b,c0,c1,c2) do { \ 610 BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 611 BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 612 mul64(lo,hi,bl,bh); \ 613 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 614 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 615 } while(0) 616 617 #define mul_add_c2(a,b,c0,c1,c2) do { \ 618 BN_ULONG tt; \ 619 BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 620 BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 621 mul64(lo,hi,bl,bh); \ 622 tt = hi; \ 623 c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \ 624 c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \ 625 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 626 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 627 } while(0) 628 629 #define sqr_add_c(a,i,c0,c1,c2) do { \ 630 BN_ULONG lo, hi; \ 631 sqr64(lo,hi,(a)[i]); \ 632 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 633 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 634 } while(0) 635 636 #define sqr_add_c2(a,i,j,c0,c1,c2) \ 637 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 638 #endif /* !BN_LLONG */ 639 640 void 641 bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 642 { 643 BN_ULONG c1, c2, c3; 644 645 c1 = 0; 646 c2 = 0; 647 c3 = 0; 648 mul_add_c(a[0], b[0], c1, c2, c3); 649 r[0] = c1; 650 c1 = 0; 651 mul_add_c(a[0], b[1], c2, c3, c1); 652 mul_add_c(a[1], b[0], c2, c3, c1); 653 r[1] = c2; 654 c2 = 0; 655 mul_add_c(a[2], b[0], c3, c1, c2); 656 mul_add_c(a[1], b[1], c3, c1, c2); 657 mul_add_c(a[0], b[2], c3, c1, c2); 658 r[2] = c3; 659 c3 = 0; 660 mul_add_c(a[0], b[3], c1, c2, c3); 661 mul_add_c(a[1], b[2], c1, c2, c3); 662 mul_add_c(a[2], b[1], c1, c2, c3); 663 mul_add_c(a[3], b[0], c1, c2, c3); 664 r[3] = c1; 665 c1 = 0; 666 mul_add_c(a[4], b[0], c2, c3, c1); 667 mul_add_c(a[3], b[1], c2, c3, c1); 668 mul_add_c(a[2], b[2], c2, c3, c1); 669 mul_add_c(a[1], b[3], c2, c3, c1); 670 mul_add_c(a[0], b[4], c2, c3, c1); 671 r[4] = c2; 672 c2 = 0; 673 mul_add_c(a[0], b[5], c3, c1, c2); 674 mul_add_c(a[1], b[4], c3, c1, c2); 675 mul_add_c(a[2], b[3], c3, c1, c2); 676 mul_add_c(a[3], b[2], c3, c1, c2); 677 mul_add_c(a[4], b[1], c3, c1, c2); 678 mul_add_c(a[5], b[0], c3, c1, c2); 679 r[5] = c3; 680 c3 = 0; 681 mul_add_c(a[6], b[0], c1, c2, c3); 682 mul_add_c(a[5], b[1], c1, c2, c3); 683 mul_add_c(a[4], b[2], c1, c2, c3); 684 mul_add_c(a[3], b[3], c1, c2, c3); 685 mul_add_c(a[2], b[4], c1, c2, c3); 686 mul_add_c(a[1], b[5], c1, c2, c3); 687 mul_add_c(a[0], b[6], c1, c2, c3); 688 r[6] = c1; 689 c1 = 0; 690 mul_add_c(a[0], b[7], c2, c3, c1); 691 mul_add_c(a[1], b[6], c2, c3, c1); 692 mul_add_c(a[2], b[5], c2, c3, c1); 693 mul_add_c(a[3], b[4], c2, c3, c1); 694 mul_add_c(a[4], b[3], c2, c3, c1); 695 mul_add_c(a[5], b[2], c2, c3, c1); 696 mul_add_c(a[6], b[1], c2, c3, c1); 697 mul_add_c(a[7], b[0], c2, c3, c1); 698 r[7] = c2; 699 c2 = 0; 700 mul_add_c(a[7], b[1], c3, c1, c2); 701 mul_add_c(a[6], b[2], c3, c1, c2); 702 mul_add_c(a[5], b[3], c3, c1, c2); 703 mul_add_c(a[4], b[4], c3, c1, c2); 704 mul_add_c(a[3], b[5], c3, c1, c2); 705 mul_add_c(a[2], b[6], c3, c1, c2); 706 mul_add_c(a[1], b[7], c3, c1, c2); 707 r[8] = c3; 708 c3 = 0; 709 mul_add_c(a[2], b[7], c1, c2, c3); 710 mul_add_c(a[3], b[6], c1, c2, c3); 711 mul_add_c(a[4], b[5], c1, c2, c3); 712 mul_add_c(a[5], b[4], c1, c2, c3); 713 mul_add_c(a[6], b[3], c1, c2, c3); 714 mul_add_c(a[7], b[2], c1, c2, c3); 715 r[9] = c1; 716 c1 = 0; 717 mul_add_c(a[7], b[3], c2, c3, c1); 718 mul_add_c(a[6], b[4], c2, c3, c1); 719 mul_add_c(a[5], b[5], c2, c3, c1); 720 mul_add_c(a[4], b[6], c2, c3, c1); 721 mul_add_c(a[3], b[7], c2, c3, c1); 722 r[10] = c2; 723 c2 = 0; 724 mul_add_c(a[4], b[7], c3, c1, c2); 725 mul_add_c(a[5], b[6], c3, c1, c2); 726 mul_add_c(a[6], b[5], c3, c1, c2); 727 mul_add_c(a[7], b[4], c3, c1, c2); 728 r[11] = c3; 729 c3 = 0; 730 mul_add_c(a[7], b[5], c1, c2, c3); 731 mul_add_c(a[6], b[6], c1, c2, c3); 732 mul_add_c(a[5], b[7], c1, c2, c3); 733 r[12] = c1; 734 c1 = 0; 735 mul_add_c(a[6], b[7], c2, c3, c1); 736 mul_add_c(a[7], b[6], c2, c3, c1); 737 r[13] = c2; 738 c2 = 0; 739 mul_add_c(a[7], b[7], c3, c1, c2); 740 r[14] = c3; 741 r[15] = c1; 742 } 743 744 void 745 bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 746 { 747 BN_ULONG c1, c2, c3; 748 749 c1 = 0; 750 c2 = 0; 751 c3 = 0; 752 mul_add_c(a[0], b[0], c1, c2, c3); 753 r[0] = c1; 754 c1 = 0; 755 mul_add_c(a[0], b[1], c2, c3, c1); 756 mul_add_c(a[1], b[0], c2, c3, c1); 757 r[1] = c2; 758 c2 = 0; 759 mul_add_c(a[2], b[0], c3, c1, c2); 760 mul_add_c(a[1], b[1], c3, c1, c2); 761 mul_add_c(a[0], b[2], c3, c1, c2); 762 r[2] = c3; 763 c3 = 0; 764 mul_add_c(a[0], b[3], c1, c2, c3); 765 mul_add_c(a[1], b[2], c1, c2, c3); 766 mul_add_c(a[2], b[1], c1, c2, c3); 767 mul_add_c(a[3], b[0], c1, c2, c3); 768 r[3] = c1; 769 c1 = 0; 770 mul_add_c(a[3], b[1], c2, c3, c1); 771 mul_add_c(a[2], b[2], c2, c3, c1); 772 mul_add_c(a[1], b[3], c2, c3, c1); 773 r[4] = c2; 774 c2 = 0; 775 mul_add_c(a[2], b[3], c3, c1, c2); 776 mul_add_c(a[3], b[2], c3, c1, c2); 777 r[5] = c3; 778 c3 = 0; 779 mul_add_c(a[3], b[3], c1, c2, c3); 780 r[6] = c1; 781 r[7] = c2; 782 } 783 784 void 785 bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 786 { 787 BN_ULONG c1, c2, c3; 788 789 c1 = 0; 790 c2 = 0; 791 c3 = 0; 792 sqr_add_c(a, 0, c1, c2, c3); 793 r[0] = c1; 794 c1 = 0; 795 sqr_add_c2(a, 1, 0, c2, c3, c1); 796 r[1] = c2; 797 c2 = 0; 798 sqr_add_c(a, 1, c3, c1, c2); 799 sqr_add_c2(a, 2, 0, c3, c1, c2); 800 r[2] = c3; 801 c3 = 0; 802 sqr_add_c2(a, 3, 0, c1, c2, c3); 803 sqr_add_c2(a, 2, 1, c1, c2, c3); 804 r[3] = c1; 805 c1 = 0; 806 sqr_add_c(a, 2, c2, c3, c1); 807 sqr_add_c2(a, 3, 1, c2, c3, c1); 808 sqr_add_c2(a, 4, 0, c2, c3, c1); 809 r[4] = c2; 810 c2 = 0; 811 sqr_add_c2(a, 5, 0, c3, c1, c2); 812 sqr_add_c2(a, 4, 1, c3, c1, c2); 813 sqr_add_c2(a, 3, 2, c3, c1, c2); 814 r[5] = c3; 815 c3 = 0; 816 sqr_add_c(a, 3, c1, c2, c3); 817 sqr_add_c2(a, 4, 2, c1, c2, c3); 818 sqr_add_c2(a, 5, 1, c1, c2, c3); 819 sqr_add_c2(a, 6, 0, c1, c2, c3); 820 r[6] = c1; 821 c1 = 0; 822 sqr_add_c2(a, 7, 0, c2, c3, c1); 823 sqr_add_c2(a, 6, 1, c2, c3, c1); 824 sqr_add_c2(a, 5, 2, c2, c3, c1); 825 sqr_add_c2(a, 4, 3, c2, c3, c1); 826 r[7] = c2; 827 c2 = 0; 828 sqr_add_c(a, 4, c3, c1, c2); 829 sqr_add_c2(a, 5, 3, c3, c1, c2); 830 sqr_add_c2(a, 6, 2, c3, c1, c2); 831 sqr_add_c2(a, 7, 1, c3, c1, c2); 832 r[8] = c3; 833 c3 = 0; 834 sqr_add_c2(a, 7, 2, c1, c2, c3); 835 sqr_add_c2(a, 6, 3, c1, c2, c3); 836 sqr_add_c2(a, 5, 4, c1, c2, c3); 837 r[9] = c1; 838 c1 = 0; 839 sqr_add_c(a, 5, c2, c3, c1); 840 sqr_add_c2(a, 6, 4, c2, c3, c1); 841 sqr_add_c2(a, 7, 3, c2, c3, c1); 842 r[10] = c2; 843 c2 = 0; 844 sqr_add_c2(a, 7, 4, c3, c1, c2); 845 sqr_add_c2(a, 6, 5, c3, c1, c2); 846 r[11] = c3; 847 c3 = 0; 848 sqr_add_c(a, 6, c1, c2, c3); 849 sqr_add_c2(a, 7, 5, c1, c2, c3); 850 r[12] = c1; 851 c1 = 0; 852 sqr_add_c2(a, 7, 6, c2, c3, c1); 853 r[13] = c2; 854 c2 = 0; 855 sqr_add_c(a, 7, c3, c1, c2); 856 r[14] = c3; 857 r[15] = c1; 858 } 859 860 void 861 bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 862 { 863 BN_ULONG c1, c2, c3; 864 865 c1 = 0; 866 c2 = 0; 867 c3 = 0; 868 sqr_add_c(a, 0, c1, c2, c3); 869 r[0] = c1; 870 c1 = 0; 871 sqr_add_c2(a, 1, 0, c2, c3, c1); 872 r[1] = c2; 873 c2 = 0; 874 sqr_add_c(a, 1, c3, c1, c2); 875 sqr_add_c2(a, 2, 0, c3, c1, c2); 876 r[2] = c3; 877 c3 = 0; 878 sqr_add_c2(a, 3, 0, c1, c2, c3); 879 sqr_add_c2(a, 2, 1, c1, c2, c3); 880 r[3] = c1; 881 c1 = 0; 882 sqr_add_c(a, 2, c2, c3, c1); 883 sqr_add_c2(a, 3, 1, c2, c3, c1); 884 r[4] = c2; 885 c2 = 0; 886 sqr_add_c2(a, 3, 2, c3, c1, c2); 887 r[5] = c3; 888 c3 = 0; 889 sqr_add_c(a, 3, c1, c2, c3); 890 r[6] = c1; 891 r[7] = c2; 892 } 893 894 #ifdef OPENSSL_NO_ASM 895 #ifdef OPENSSL_BN_ASM_MONT 896 /* 897 * This is essentially reference implementation, which may or may not 898 * result in performance improvement. E.g. on IA-32 this routine was 899 * observed to give 40% faster rsa1024 private key operations and 10% 900 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only 901 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a 902 * reference implementation, one to be used as starting point for 903 * platform-specific assembler. Mentioned numbers apply to compiler 904 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and 905 * can vary not only from platform to platform, but even for compiler 906 * versions. Assembler vs. assembler improvement coefficients can 907 * [and are known to] differ and are to be documented elsewhere. 908 */ 909 int 910 bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0p, int num) 911 { 912 BN_ULONG c0, c1, ml, *tp, n0; 913 #ifdef mul64 914 BN_ULONG mh; 915 #endif 916 int i = 0, j; 917 918 #if 0 /* template for platform-specific implementation */ 919 if (ap == bp) 920 return bn_sqr_mont(rp, ap, np, n0p, num); 921 #endif 922 tp = reallocarray(NULL, num + 2, sizeof(BN_ULONG)); 923 if (tp == NULL) 924 return 0; 925 926 n0 = *n0p; 927 928 c0 = 0; 929 ml = bp[0]; 930 #ifdef mul64 931 mh = HBITS(ml); 932 ml = LBITS(ml); 933 for (j = 0; j < num; ++j) 934 mul(tp[j], ap[j], ml, mh, c0); 935 #else 936 for (j = 0; j < num; ++j) 937 mul(tp[j], ap[j], ml, c0); 938 #endif 939 940 tp[num] = c0; 941 tp[num + 1] = 0; 942 goto enter; 943 944 for (i = 0; i < num; i++) { 945 c0 = 0; 946 ml = bp[i]; 947 #ifdef mul64 948 mh = HBITS(ml); 949 ml = LBITS(ml); 950 for (j = 0; j < num; ++j) 951 mul_add(tp[j], ap[j], ml, mh, c0); 952 #else 953 for (j = 0; j < num; ++j) 954 mul_add(tp[j], ap[j], ml, c0); 955 #endif 956 c1 = (tp[num] + c0) & BN_MASK2; 957 tp[num] = c1; 958 tp[num + 1] = (c1 < c0 ? 1 : 0); 959 enter: 960 c1 = tp[0]; 961 ml = (c1 * n0) & BN_MASK2; 962 c0 = 0; 963 #ifdef mul64 964 mh = HBITS(ml); 965 ml = LBITS(ml); 966 mul_add(c1, np[0], ml, mh, c0); 967 #else 968 mul_add(c1, ml, np[0], c0); 969 #endif 970 for (j = 1; j < num; j++) { 971 c1 = tp[j]; 972 #ifdef mul64 973 mul_add(c1, np[j], ml, mh, c0); 974 #else 975 mul_add(c1, ml, np[j], c0); 976 #endif 977 tp[j - 1] = c1 & BN_MASK2; 978 } 979 c1 = (tp[num] + c0) & BN_MASK2; 980 tp[num - 1] = c1; 981 tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); 982 } 983 984 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 985 c0 = bn_sub_words(rp, tp, np, num); 986 if (tp[num] != 0 || c0 == 0) { 987 goto out; 988 } 989 } 990 memcpy(rp, tp, num * sizeof(BN_ULONG)); 991 out: 992 explicit_bzero(tp, (num + 2) * sizeof(BN_ULONG)); 993 free(tp); 994 return 1; 995 } 996 #else 997 /* 998 * Return value of 0 indicates that multiplication/convolution was not 999 * performed to signal the caller to fall down to alternative/original 1000 * code-path. 1001 */ 1002 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num) 1003 { return 0; 1004 } 1005 #endif /* OPENSSL_BN_ASM_MONT */ 1006 #endif 1007 1008 #else /* !BN_MUL_COMBA */ 1009 1010 /* hmm... is it faster just to do a multiply? */ 1011 #undef bn_sqr_comba4 1012 void 1013 bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 1014 { 1015 BN_ULONG t[8]; 1016 bn_sqr_normal(r, a, 4, t); 1017 } 1018 1019 #undef bn_sqr_comba8 1020 void 1021 bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 1022 { 1023 BN_ULONG t[16]; 1024 bn_sqr_normal(r, a, 8, t); 1025 } 1026 1027 void 1028 bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1029 { 1030 r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); 1031 r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); 1032 r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); 1033 r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); 1034 } 1035 1036 void 1037 bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1038 { 1039 r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); 1040 r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); 1041 r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); 1042 r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); 1043 r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); 1044 r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); 1045 r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); 1046 r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); 1047 } 1048 1049 #ifdef OPENSSL_NO_ASM 1050 #ifdef OPENSSL_BN_ASM_MONT 1051 int 1052 bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1053 const BN_ULONG *np, const BN_ULONG *n0p, int num) 1054 { 1055 BN_ULONG c0, c1, *tp, n0 = *n0p; 1056 int i = 0, j; 1057 1058 tp = calloc(NULL, num + 2, sizeof(BN_ULONG)); 1059 if (tp == NULL) 1060 return 0; 1061 1062 for (i = 0; i < num; i++) { 1063 c0 = bn_mul_add_words(tp, ap, num, bp[i]); 1064 c1 = (tp[num] + c0) & BN_MASK2; 1065 tp[num] = c1; 1066 tp[num + 1] = (c1 < c0 ? 1 : 0); 1067 1068 c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); 1069 c1 = (tp[num] + c0) & BN_MASK2; 1070 tp[num] = c1; 1071 tp[num + 1] += (c1 < c0 ? 1 : 0); 1072 for (j = 0; j <= num; j++) 1073 tp[j] = tp[j + 1]; 1074 } 1075 1076 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 1077 c0 = bn_sub_words(rp, tp, np, num); 1078 if (tp[num] != 0 || c0 == 0) { 1079 goto out; 1080 } 1081 } 1082 memcpy(rp, tp, num * sizeof(BN_ULONG)); 1083 out: 1084 explicit_bzero(tp, (num + 2) * sizeof(BN_ULONG)); 1085 free(tp); 1086 return 1; 1087 } 1088 #else 1089 int 1090 bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1091 const BN_ULONG *np, const BN_ULONG *n0, int num) 1092 { 1093 return 0; 1094 } 1095 #endif /* OPENSSL_BN_ASM_MONT */ 1096 #endif 1097 1098 #endif /* !BN_MUL_COMBA */ 1099