1 /* crypto/bn/bn_asm.c */ 2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 3 * All rights reserved. 4 * 5 * This package is an SSL implementation written 6 * by Eric Young (eay@cryptsoft.com). 7 * The implementation was written so as to conform with Netscapes SSL. 8 * 9 * This library is free for commercial and non-commercial use as long as 10 * the following conditions are aheared to. The following conditions 11 * apply to all code found in this distribution, be it the RC4, RSA, 12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation 13 * included with this distribution is covered by the same copyright terms 14 * except that the holder is Tim Hudson (tjh@cryptsoft.com). 15 * 16 * Copyright remains Eric Young's, and as such any Copyright notices in 17 * the code are not to be removed. 18 * If this package is used in a product, Eric Young should be given attribution 19 * as the author of the parts of the library used. 20 * This can be in the form of a textual message at program startup or 21 * in documentation (online or textual) provided with the package. 22 * 23 * Redistribution and use in source and binary forms, with or without 24 * modification, are permitted provided that the following conditions 25 * are met: 26 * 1. Redistributions of source code must retain the copyright 27 * notice, this list of conditions and the following disclaimer. 28 * 2. Redistributions in binary form must reproduce the above copyright 29 * notice, this list of conditions and the following disclaimer in the 30 * documentation and/or other materials provided with the distribution. 31 * 3. All advertising materials mentioning features or use of this software 32 * must display the following acknowledgement: 33 * "This product includes cryptographic software written by 34 * Eric Young (eay@cryptsoft.com)" 35 * The word 'cryptographic' can be left out if the rouines from the library 36 * being used are not cryptographic related :-). 37 * 4. If you include any Windows specific code (or a derivative thereof) from 38 * the apps directory (application code) you must include an acknowledgement: 39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" 40 * 41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND 42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 51 * SUCH DAMAGE. 52 * 53 * The licence and distribution terms for any publically available version or 54 * derivative of this code cannot be changed. i.e. this code cannot simply be 55 * copied and put under another distribution licence 56 * [including the GNU Public Licence.] 57 */ 58 59 #ifndef BN_DEBUG 60 # undef NDEBUG /* avoid conflicting definitions */ 61 # define NDEBUG 62 #endif 63 64 #include <stdio.h> 65 #include <assert.h> 66 #include "cryptlib.h" 67 #include "bn_lcl.h" 68 69 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 70 71 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 72 BN_ULONG w) 73 { 74 BN_ULONG c1 = 0; 75 76 assert(num >= 0); 77 if (num <= 0) 78 return (c1); 79 80 # ifndef OPENSSL_SMALL_FOOTPRINT 81 while (num & ~3) { 82 mul_add(rp[0], ap[0], w, c1); 83 mul_add(rp[1], ap[1], w, c1); 84 mul_add(rp[2], ap[2], w, c1); 85 mul_add(rp[3], ap[3], w, c1); 86 ap += 4; 87 rp += 4; 88 num -= 4; 89 } 90 # endif 91 while (num) { 92 mul_add(rp[0], ap[0], w, c1); 93 ap++; 94 rp++; 95 num--; 96 } 97 98 return (c1); 99 } 100 101 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 102 { 103 BN_ULONG c1 = 0; 104 105 assert(num >= 0); 106 if (num <= 0) 107 return (c1); 108 109 # ifndef OPENSSL_SMALL_FOOTPRINT 110 while (num & ~3) { 111 mul(rp[0], ap[0], w, c1); 112 mul(rp[1], ap[1], w, c1); 113 mul(rp[2], ap[2], w, c1); 114 mul(rp[3], ap[3], w, c1); 115 ap += 4; 116 rp += 4; 117 num -= 4; 118 } 119 # endif 120 while (num) { 121 mul(rp[0], ap[0], w, c1); 122 ap++; 123 rp++; 124 num--; 125 } 126 return (c1); 127 } 128 129 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 130 { 131 assert(n >= 0); 132 if (n <= 0) 133 return; 134 135 # ifndef OPENSSL_SMALL_FOOTPRINT 136 while (n & ~3) { 137 sqr(r[0], r[1], a[0]); 138 sqr(r[2], r[3], a[1]); 139 sqr(r[4], r[5], a[2]); 140 sqr(r[6], r[7], a[3]); 141 a += 4; 142 r += 8; 143 n -= 4; 144 } 145 # endif 146 while (n) { 147 sqr(r[0], r[1], a[0]); 148 a++; 149 r += 2; 150 n--; 151 } 152 } 153 154 #else /* !(defined(BN_LLONG) || 155 * defined(BN_UMULT_HIGH)) */ 156 157 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 158 BN_ULONG w) 159 { 160 BN_ULONG c = 0; 161 BN_ULONG bl, bh; 162 163 assert(num >= 0); 164 if (num <= 0) 165 return ((BN_ULONG)0); 166 167 bl = LBITS(w); 168 bh = HBITS(w); 169 170 # ifndef OPENSSL_SMALL_FOOTPRINT 171 while (num & ~3) { 172 mul_add(rp[0], ap[0], bl, bh, c); 173 mul_add(rp[1], ap[1], bl, bh, c); 174 mul_add(rp[2], ap[2], bl, bh, c); 175 mul_add(rp[3], ap[3], bl, bh, c); 176 ap += 4; 177 rp += 4; 178 num -= 4; 179 } 180 # endif 181 while (num) { 182 mul_add(rp[0], ap[0], bl, bh, c); 183 ap++; 184 rp++; 185 num--; 186 } 187 return (c); 188 } 189 190 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 191 { 192 BN_ULONG carry = 0; 193 BN_ULONG bl, bh; 194 195 assert(num >= 0); 196 if (num <= 0) 197 return ((BN_ULONG)0); 198 199 bl = LBITS(w); 200 bh = HBITS(w); 201 202 # ifndef OPENSSL_SMALL_FOOTPRINT 203 while (num & ~3) { 204 mul(rp[0], ap[0], bl, bh, carry); 205 mul(rp[1], ap[1], bl, bh, carry); 206 mul(rp[2], ap[2], bl, bh, carry); 207 mul(rp[3], ap[3], bl, bh, carry); 208 ap += 4; 209 rp += 4; 210 num -= 4; 211 } 212 # endif 213 while (num) { 214 mul(rp[0], ap[0], bl, bh, carry); 215 ap++; 216 rp++; 217 num--; 218 } 219 return (carry); 220 } 221 222 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 223 { 224 assert(n >= 0); 225 if (n <= 0) 226 return; 227 228 # ifndef OPENSSL_SMALL_FOOTPRINT 229 while (n & ~3) { 230 sqr64(r[0], r[1], a[0]); 231 sqr64(r[2], r[3], a[1]); 232 sqr64(r[4], r[5], a[2]); 233 sqr64(r[6], r[7], a[3]); 234 a += 4; 235 r += 8; 236 n -= 4; 237 } 238 # endif 239 while (n) { 240 sqr64(r[0], r[1], a[0]); 241 a++; 242 r += 2; 243 n--; 244 } 245 } 246 247 #endif /* !(defined(BN_LLONG) || 248 * defined(BN_UMULT_HIGH)) */ 249 250 #if defined(BN_LLONG) && defined(BN_DIV2W) 251 252 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 253 { 254 return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); 255 } 256 257 #else 258 259 /* Divide h,l by d and return the result. */ 260 /* I need to test this some more :-( */ 261 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 262 { 263 BN_ULONG dh, dl, q, ret = 0, th, tl, t; 264 int i, count = 2; 265 266 if (d == 0) 267 return (BN_MASK2); 268 269 i = BN_num_bits_word(d); 270 assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); 271 272 i = BN_BITS2 - i; 273 if (h >= d) 274 h -= d; 275 276 if (i) { 277 d <<= i; 278 h = (h << i) | (l >> (BN_BITS2 - i)); 279 l <<= i; 280 } 281 dh = (d & BN_MASK2h) >> BN_BITS4; 282 dl = (d & BN_MASK2l); 283 for (;;) { 284 if ((h >> BN_BITS4) == dh) 285 q = BN_MASK2l; 286 else 287 q = h / dh; 288 289 th = q * dh; 290 tl = dl * q; 291 for (;;) { 292 t = h - th; 293 if ((t & BN_MASK2h) || 294 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) 295 break; 296 q--; 297 th -= dh; 298 tl -= dl; 299 } 300 t = (tl >> BN_BITS4); 301 tl = (tl << BN_BITS4) & BN_MASK2h; 302 th += t; 303 304 if (l < tl) 305 th++; 306 l -= tl; 307 if (h < th) { 308 h += d; 309 q--; 310 } 311 h -= th; 312 313 if (--count == 0) 314 break; 315 316 ret = q << BN_BITS4; 317 h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; 318 l = (l & BN_MASK2l) << BN_BITS4; 319 } 320 ret |= q; 321 return (ret); 322 } 323 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ 324 325 #ifdef BN_LLONG 326 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 327 int n) 328 { 329 BN_ULLONG ll = 0; 330 331 assert(n >= 0); 332 if (n <= 0) 333 return ((BN_ULONG)0); 334 335 # ifndef OPENSSL_SMALL_FOOTPRINT 336 while (n & ~3) { 337 ll += (BN_ULLONG) a[0] + b[0]; 338 r[0] = (BN_ULONG)ll & BN_MASK2; 339 ll >>= BN_BITS2; 340 ll += (BN_ULLONG) a[1] + b[1]; 341 r[1] = (BN_ULONG)ll & BN_MASK2; 342 ll >>= BN_BITS2; 343 ll += (BN_ULLONG) a[2] + b[2]; 344 r[2] = (BN_ULONG)ll & BN_MASK2; 345 ll >>= BN_BITS2; 346 ll += (BN_ULLONG) a[3] + b[3]; 347 r[3] = (BN_ULONG)ll & BN_MASK2; 348 ll >>= BN_BITS2; 349 a += 4; 350 b += 4; 351 r += 4; 352 n -= 4; 353 } 354 # endif 355 while (n) { 356 ll += (BN_ULLONG) a[0] + b[0]; 357 r[0] = (BN_ULONG)ll & BN_MASK2; 358 ll >>= BN_BITS2; 359 a++; 360 b++; 361 r++; 362 n--; 363 } 364 return ((BN_ULONG)ll); 365 } 366 #else /* !BN_LLONG */ 367 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 368 int n) 369 { 370 BN_ULONG c, l, t; 371 372 assert(n >= 0); 373 if (n <= 0) 374 return ((BN_ULONG)0); 375 376 c = 0; 377 # ifndef OPENSSL_SMALL_FOOTPRINT 378 while (n & ~3) { 379 t = a[0]; 380 t = (t + c) & BN_MASK2; 381 c = (t < c); 382 l = (t + b[0]) & BN_MASK2; 383 c += (l < t); 384 r[0] = l; 385 t = a[1]; 386 t = (t + c) & BN_MASK2; 387 c = (t < c); 388 l = (t + b[1]) & BN_MASK2; 389 c += (l < t); 390 r[1] = l; 391 t = a[2]; 392 t = (t + c) & BN_MASK2; 393 c = (t < c); 394 l = (t + b[2]) & BN_MASK2; 395 c += (l < t); 396 r[2] = l; 397 t = a[3]; 398 t = (t + c) & BN_MASK2; 399 c = (t < c); 400 l = (t + b[3]) & BN_MASK2; 401 c += (l < t); 402 r[3] = l; 403 a += 4; 404 b += 4; 405 r += 4; 406 n -= 4; 407 } 408 # endif 409 while (n) { 410 t = a[0]; 411 t = (t + c) & BN_MASK2; 412 c = (t < c); 413 l = (t + b[0]) & BN_MASK2; 414 c += (l < t); 415 r[0] = l; 416 a++; 417 b++; 418 r++; 419 n--; 420 } 421 return ((BN_ULONG)c); 422 } 423 #endif /* !BN_LLONG */ 424 425 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 426 int n) 427 { 428 BN_ULONG t1, t2; 429 int c = 0; 430 431 assert(n >= 0); 432 if (n <= 0) 433 return ((BN_ULONG)0); 434 435 #ifndef OPENSSL_SMALL_FOOTPRINT 436 while (n & ~3) { 437 t1 = a[0]; 438 t2 = b[0]; 439 r[0] = (t1 - t2 - c) & BN_MASK2; 440 if (t1 != t2) 441 c = (t1 < t2); 442 t1 = a[1]; 443 t2 = b[1]; 444 r[1] = (t1 - t2 - c) & BN_MASK2; 445 if (t1 != t2) 446 c = (t1 < t2); 447 t1 = a[2]; 448 t2 = b[2]; 449 r[2] = (t1 - t2 - c) & BN_MASK2; 450 if (t1 != t2) 451 c = (t1 < t2); 452 t1 = a[3]; 453 t2 = b[3]; 454 r[3] = (t1 - t2 - c) & BN_MASK2; 455 if (t1 != t2) 456 c = (t1 < t2); 457 a += 4; 458 b += 4; 459 r += 4; 460 n -= 4; 461 } 462 #endif 463 while (n) { 464 t1 = a[0]; 465 t2 = b[0]; 466 r[0] = (t1 - t2 - c) & BN_MASK2; 467 if (t1 != t2) 468 c = (t1 < t2); 469 a++; 470 b++; 471 r++; 472 n--; 473 } 474 return (c); 475 } 476 477 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 478 479 # undef bn_mul_comba8 480 # undef bn_mul_comba4 481 # undef bn_sqr_comba8 482 # undef bn_sqr_comba4 483 484 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 485 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 486 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 487 /* 488 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 489 * c=(c2,c1,c0) 490 */ 491 492 # ifdef BN_LLONG 493 /* 494 * Keep in mind that additions to multiplication result can not 495 * overflow, because its high half cannot be all-ones. 496 */ 497 # define mul_add_c(a,b,c0,c1,c2) do { \ 498 BN_ULONG hi; \ 499 BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 500 t += c0; /* no carry */ \ 501 c0 = (BN_ULONG)Lw(t); \ 502 hi = (BN_ULONG)Hw(t); \ 503 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 504 } while(0) 505 506 # define mul_add_c2(a,b,c0,c1,c2) do { \ 507 BN_ULONG hi; \ 508 BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 509 BN_ULLONG tt = t+c0; /* no carry */ \ 510 c0 = (BN_ULONG)Lw(tt); \ 511 hi = (BN_ULONG)Hw(tt); \ 512 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 513 t += c0; /* no carry */ \ 514 c0 = (BN_ULONG)Lw(t); \ 515 hi = (BN_ULONG)Hw(t); \ 516 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 517 } while(0) 518 519 # define sqr_add_c(a,i,c0,c1,c2) do { \ 520 BN_ULONG hi; \ 521 BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ 522 t += c0; /* no carry */ \ 523 c0 = (BN_ULONG)Lw(t); \ 524 hi = (BN_ULONG)Hw(t); \ 525 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 526 } while(0) 527 528 # define sqr_add_c2(a,i,j,c0,c1,c2) \ 529 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 530 531 # elif defined(BN_UMULT_LOHI) 532 /* 533 * Keep in mind that additions to hi can not overflow, because 534 * the high word of a multiplication result cannot be all-ones. 535 */ 536 # define mul_add_c(a,b,c0,c1,c2) do { \ 537 BN_ULONG ta = (a), tb = (b); \ 538 BN_ULONG lo, hi; \ 539 BN_UMULT_LOHI(lo,hi,ta,tb); \ 540 c0 += lo; hi += (c0<lo)?1:0; \ 541 c1 += hi; c2 += (c1<hi)?1:0; \ 542 } while(0) 543 544 # define mul_add_c2(a,b,c0,c1,c2) do { \ 545 BN_ULONG ta = (a), tb = (b); \ 546 BN_ULONG lo, hi, tt; \ 547 BN_UMULT_LOHI(lo,hi,ta,tb); \ 548 c0 += lo; tt = hi+((c0<lo)?1:0); \ 549 c1 += tt; c2 += (c1<tt)?1:0; \ 550 c0 += lo; hi += (c0<lo)?1:0; \ 551 c1 += hi; c2 += (c1<hi)?1:0; \ 552 } while(0) 553 554 # define sqr_add_c(a,i,c0,c1,c2) do { \ 555 BN_ULONG ta = (a)[i]; \ 556 BN_ULONG lo, hi; \ 557 BN_UMULT_LOHI(lo,hi,ta,ta); \ 558 c0 += lo; hi += (c0<lo)?1:0; \ 559 c1 += hi; c2 += (c1<hi)?1:0; \ 560 } while(0) 561 562 # define sqr_add_c2(a,i,j,c0,c1,c2) \ 563 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 564 565 # elif defined(BN_UMULT_HIGH) 566 /* 567 * Keep in mind that additions to hi can not overflow, because 568 * the high word of a multiplication result cannot be all-ones. 569 */ 570 # define mul_add_c(a,b,c0,c1,c2) do { \ 571 BN_ULONG ta = (a), tb = (b); \ 572 BN_ULONG lo = ta * tb; \ 573 BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 574 c0 += lo; hi += (c0<lo)?1:0; \ 575 c1 += hi; c2 += (c1<hi)?1:0; \ 576 } while(0) 577 578 # define mul_add_c2(a,b,c0,c1,c2) do { \ 579 BN_ULONG ta = (a), tb = (b), tt; \ 580 BN_ULONG lo = ta * tb; \ 581 BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 582 c0 += lo; tt = hi + ((c0<lo)?1:0); \ 583 c1 += tt; c2 += (c1<tt)?1:0; \ 584 c0 += lo; hi += (c0<lo)?1:0; \ 585 c1 += hi; c2 += (c1<hi)?1:0; \ 586 } while(0) 587 588 # define sqr_add_c(a,i,c0,c1,c2) do { \ 589 BN_ULONG ta = (a)[i]; \ 590 BN_ULONG lo = ta * ta; \ 591 BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ 592 c0 += lo; hi += (c0<lo)?1:0; \ 593 c1 += hi; c2 += (c1<hi)?1:0; \ 594 } while(0) 595 596 # define sqr_add_c2(a,i,j,c0,c1,c2) \ 597 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 598 599 # else /* !BN_LLONG */ 600 /* 601 * Keep in mind that additions to hi can not overflow, because 602 * the high word of a multiplication result cannot be all-ones. 603 */ 604 # define mul_add_c(a,b,c0,c1,c2) do { \ 605 BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 606 BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 607 mul64(lo,hi,bl,bh); \ 608 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 609 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 610 } while(0) 611 612 # define mul_add_c2(a,b,c0,c1,c2) do { \ 613 BN_ULONG tt; \ 614 BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 615 BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 616 mul64(lo,hi,bl,bh); \ 617 tt = hi; \ 618 c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \ 619 c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \ 620 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 621 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 622 } while(0) 623 624 # define sqr_add_c(a,i,c0,c1,c2) do { \ 625 BN_ULONG lo, hi; \ 626 sqr64(lo,hi,(a)[i]); \ 627 c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 628 c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 629 } while(0) 630 631 # define sqr_add_c2(a,i,j,c0,c1,c2) \ 632 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 633 # endif /* !BN_LLONG */ 634 635 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 636 { 637 BN_ULONG c1, c2, c3; 638 639 c1 = 0; 640 c2 = 0; 641 c3 = 0; 642 mul_add_c(a[0], b[0], c1, c2, c3); 643 r[0] = c1; 644 c1 = 0; 645 mul_add_c(a[0], b[1], c2, c3, c1); 646 mul_add_c(a[1], b[0], c2, c3, c1); 647 r[1] = c2; 648 c2 = 0; 649 mul_add_c(a[2], b[0], c3, c1, c2); 650 mul_add_c(a[1], b[1], c3, c1, c2); 651 mul_add_c(a[0], b[2], c3, c1, c2); 652 r[2] = c3; 653 c3 = 0; 654 mul_add_c(a[0], b[3], c1, c2, c3); 655 mul_add_c(a[1], b[2], c1, c2, c3); 656 mul_add_c(a[2], b[1], c1, c2, c3); 657 mul_add_c(a[3], b[0], c1, c2, c3); 658 r[3] = c1; 659 c1 = 0; 660 mul_add_c(a[4], b[0], c2, c3, c1); 661 mul_add_c(a[3], b[1], c2, c3, c1); 662 mul_add_c(a[2], b[2], c2, c3, c1); 663 mul_add_c(a[1], b[3], c2, c3, c1); 664 mul_add_c(a[0], b[4], c2, c3, c1); 665 r[4] = c2; 666 c2 = 0; 667 mul_add_c(a[0], b[5], c3, c1, c2); 668 mul_add_c(a[1], b[4], c3, c1, c2); 669 mul_add_c(a[2], b[3], c3, c1, c2); 670 mul_add_c(a[3], b[2], c3, c1, c2); 671 mul_add_c(a[4], b[1], c3, c1, c2); 672 mul_add_c(a[5], b[0], c3, c1, c2); 673 r[5] = c3; 674 c3 = 0; 675 mul_add_c(a[6], b[0], c1, c2, c3); 676 mul_add_c(a[5], b[1], c1, c2, c3); 677 mul_add_c(a[4], b[2], c1, c2, c3); 678 mul_add_c(a[3], b[3], c1, c2, c3); 679 mul_add_c(a[2], b[4], c1, c2, c3); 680 mul_add_c(a[1], b[5], c1, c2, c3); 681 mul_add_c(a[0], b[6], c1, c2, c3); 682 r[6] = c1; 683 c1 = 0; 684 mul_add_c(a[0], b[7], c2, c3, c1); 685 mul_add_c(a[1], b[6], c2, c3, c1); 686 mul_add_c(a[2], b[5], c2, c3, c1); 687 mul_add_c(a[3], b[4], c2, c3, c1); 688 mul_add_c(a[4], b[3], c2, c3, c1); 689 mul_add_c(a[5], b[2], c2, c3, c1); 690 mul_add_c(a[6], b[1], c2, c3, c1); 691 mul_add_c(a[7], b[0], c2, c3, c1); 692 r[7] = c2; 693 c2 = 0; 694 mul_add_c(a[7], b[1], c3, c1, c2); 695 mul_add_c(a[6], b[2], c3, c1, c2); 696 mul_add_c(a[5], b[3], c3, c1, c2); 697 mul_add_c(a[4], b[4], c3, c1, c2); 698 mul_add_c(a[3], b[5], c3, c1, c2); 699 mul_add_c(a[2], b[6], c3, c1, c2); 700 mul_add_c(a[1], b[7], c3, c1, c2); 701 r[8] = c3; 702 c3 = 0; 703 mul_add_c(a[2], b[7], c1, c2, c3); 704 mul_add_c(a[3], b[6], c1, c2, c3); 705 mul_add_c(a[4], b[5], c1, c2, c3); 706 mul_add_c(a[5], b[4], c1, c2, c3); 707 mul_add_c(a[6], b[3], c1, c2, c3); 708 mul_add_c(a[7], b[2], c1, c2, c3); 709 r[9] = c1; 710 c1 = 0; 711 mul_add_c(a[7], b[3], c2, c3, c1); 712 mul_add_c(a[6], b[4], c2, c3, c1); 713 mul_add_c(a[5], b[5], c2, c3, c1); 714 mul_add_c(a[4], b[6], c2, c3, c1); 715 mul_add_c(a[3], b[7], c2, c3, c1); 716 r[10] = c2; 717 c2 = 0; 718 mul_add_c(a[4], b[7], c3, c1, c2); 719 mul_add_c(a[5], b[6], c3, c1, c2); 720 mul_add_c(a[6], b[5], c3, c1, c2); 721 mul_add_c(a[7], b[4], c3, c1, c2); 722 r[11] = c3; 723 c3 = 0; 724 mul_add_c(a[7], b[5], c1, c2, c3); 725 mul_add_c(a[6], b[6], c1, c2, c3); 726 mul_add_c(a[5], b[7], c1, c2, c3); 727 r[12] = c1; 728 c1 = 0; 729 mul_add_c(a[6], b[7], c2, c3, c1); 730 mul_add_c(a[7], b[6], c2, c3, c1); 731 r[13] = c2; 732 c2 = 0; 733 mul_add_c(a[7], b[7], c3, c1, c2); 734 r[14] = c3; 735 r[15] = c1; 736 } 737 738 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 739 { 740 BN_ULONG c1, c2, c3; 741 742 c1 = 0; 743 c2 = 0; 744 c3 = 0; 745 mul_add_c(a[0], b[0], c1, c2, c3); 746 r[0] = c1; 747 c1 = 0; 748 mul_add_c(a[0], b[1], c2, c3, c1); 749 mul_add_c(a[1], b[0], c2, c3, c1); 750 r[1] = c2; 751 c2 = 0; 752 mul_add_c(a[2], b[0], c3, c1, c2); 753 mul_add_c(a[1], b[1], c3, c1, c2); 754 mul_add_c(a[0], b[2], c3, c1, c2); 755 r[2] = c3; 756 c3 = 0; 757 mul_add_c(a[0], b[3], c1, c2, c3); 758 mul_add_c(a[1], b[2], c1, c2, c3); 759 mul_add_c(a[2], b[1], c1, c2, c3); 760 mul_add_c(a[3], b[0], c1, c2, c3); 761 r[3] = c1; 762 c1 = 0; 763 mul_add_c(a[3], b[1], c2, c3, c1); 764 mul_add_c(a[2], b[2], c2, c3, c1); 765 mul_add_c(a[1], b[3], c2, c3, c1); 766 r[4] = c2; 767 c2 = 0; 768 mul_add_c(a[2], b[3], c3, c1, c2); 769 mul_add_c(a[3], b[2], c3, c1, c2); 770 r[5] = c3; 771 c3 = 0; 772 mul_add_c(a[3], b[3], c1, c2, c3); 773 r[6] = c1; 774 r[7] = c2; 775 } 776 777 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 778 { 779 BN_ULONG c1, c2, c3; 780 781 c1 = 0; 782 c2 = 0; 783 c3 = 0; 784 sqr_add_c(a, 0, c1, c2, c3); 785 r[0] = c1; 786 c1 = 0; 787 sqr_add_c2(a, 1, 0, c2, c3, c1); 788 r[1] = c2; 789 c2 = 0; 790 sqr_add_c(a, 1, c3, c1, c2); 791 sqr_add_c2(a, 2, 0, c3, c1, c2); 792 r[2] = c3; 793 c3 = 0; 794 sqr_add_c2(a, 3, 0, c1, c2, c3); 795 sqr_add_c2(a, 2, 1, c1, c2, c3); 796 r[3] = c1; 797 c1 = 0; 798 sqr_add_c(a, 2, c2, c3, c1); 799 sqr_add_c2(a, 3, 1, c2, c3, c1); 800 sqr_add_c2(a, 4, 0, c2, c3, c1); 801 r[4] = c2; 802 c2 = 0; 803 sqr_add_c2(a, 5, 0, c3, c1, c2); 804 sqr_add_c2(a, 4, 1, c3, c1, c2); 805 sqr_add_c2(a, 3, 2, c3, c1, c2); 806 r[5] = c3; 807 c3 = 0; 808 sqr_add_c(a, 3, c1, c2, c3); 809 sqr_add_c2(a, 4, 2, c1, c2, c3); 810 sqr_add_c2(a, 5, 1, c1, c2, c3); 811 sqr_add_c2(a, 6, 0, c1, c2, c3); 812 r[6] = c1; 813 c1 = 0; 814 sqr_add_c2(a, 7, 0, c2, c3, c1); 815 sqr_add_c2(a, 6, 1, c2, c3, c1); 816 sqr_add_c2(a, 5, 2, c2, c3, c1); 817 sqr_add_c2(a, 4, 3, c2, c3, c1); 818 r[7] = c2; 819 c2 = 0; 820 sqr_add_c(a, 4, c3, c1, c2); 821 sqr_add_c2(a, 5, 3, c3, c1, c2); 822 sqr_add_c2(a, 6, 2, c3, c1, c2); 823 sqr_add_c2(a, 7, 1, c3, c1, c2); 824 r[8] = c3; 825 c3 = 0; 826 sqr_add_c2(a, 7, 2, c1, c2, c3); 827 sqr_add_c2(a, 6, 3, c1, c2, c3); 828 sqr_add_c2(a, 5, 4, c1, c2, c3); 829 r[9] = c1; 830 c1 = 0; 831 sqr_add_c(a, 5, c2, c3, c1); 832 sqr_add_c2(a, 6, 4, c2, c3, c1); 833 sqr_add_c2(a, 7, 3, c2, c3, c1); 834 r[10] = c2; 835 c2 = 0; 836 sqr_add_c2(a, 7, 4, c3, c1, c2); 837 sqr_add_c2(a, 6, 5, c3, c1, c2); 838 r[11] = c3; 839 c3 = 0; 840 sqr_add_c(a, 6, c1, c2, c3); 841 sqr_add_c2(a, 7, 5, c1, c2, c3); 842 r[12] = c1; 843 c1 = 0; 844 sqr_add_c2(a, 7, 6, c2, c3, c1); 845 r[13] = c2; 846 c2 = 0; 847 sqr_add_c(a, 7, c3, c1, c2); 848 r[14] = c3; 849 r[15] = c1; 850 } 851 852 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 853 { 854 BN_ULONG c1, c2, c3; 855 856 c1 = 0; 857 c2 = 0; 858 c3 = 0; 859 sqr_add_c(a, 0, c1, c2, c3); 860 r[0] = c1; 861 c1 = 0; 862 sqr_add_c2(a, 1, 0, c2, c3, c1); 863 r[1] = c2; 864 c2 = 0; 865 sqr_add_c(a, 1, c3, c1, c2); 866 sqr_add_c2(a, 2, 0, c3, c1, c2); 867 r[2] = c3; 868 c3 = 0; 869 sqr_add_c2(a, 3, 0, c1, c2, c3); 870 sqr_add_c2(a, 2, 1, c1, c2, c3); 871 r[3] = c1; 872 c1 = 0; 873 sqr_add_c(a, 2, c2, c3, c1); 874 sqr_add_c2(a, 3, 1, c2, c3, c1); 875 r[4] = c2; 876 c2 = 0; 877 sqr_add_c2(a, 3, 2, c3, c1, c2); 878 r[5] = c3; 879 c3 = 0; 880 sqr_add_c(a, 3, c1, c2, c3); 881 r[6] = c1; 882 r[7] = c2; 883 } 884 885 # ifdef OPENSSL_NO_ASM 886 # ifdef OPENSSL_BN_ASM_MONT 887 # include <alloca.h> 888 /* 889 * This is essentially reference implementation, which may or may not 890 * result in performance improvement. E.g. on IA-32 this routine was 891 * observed to give 40% faster rsa1024 private key operations and 10% 892 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only 893 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a 894 * reference implementation, one to be used as starting point for 895 * platform-specific assembler. Mentioned numbers apply to compiler 896 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and 897 * can vary not only from platform to platform, but even for compiler 898 * versions. Assembler vs. assembler improvement coefficients can 899 * [and are known to] differ and are to be documented elsewhere. 900 */ 901 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 902 const BN_ULONG *np, const BN_ULONG *n0p, int num) 903 { 904 BN_ULONG c0, c1, ml, *tp, n0; 905 # ifdef mul64 906 BN_ULONG mh; 907 # endif 908 volatile BN_ULONG *vp; 909 int i = 0, j; 910 911 # if 0 /* template for platform-specific 912 * implementation */ 913 if (ap == bp) 914 return bn_sqr_mont(rp, ap, np, n0p, num); 915 # endif 916 vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 917 918 n0 = *n0p; 919 920 c0 = 0; 921 ml = bp[0]; 922 # ifdef mul64 923 mh = HBITS(ml); 924 ml = LBITS(ml); 925 for (j = 0; j < num; ++j) 926 mul(tp[j], ap[j], ml, mh, c0); 927 # else 928 for (j = 0; j < num; ++j) 929 mul(tp[j], ap[j], ml, c0); 930 # endif 931 932 tp[num] = c0; 933 tp[num + 1] = 0; 934 goto enter; 935 936 for (i = 0; i < num; i++) { 937 c0 = 0; 938 ml = bp[i]; 939 # ifdef mul64 940 mh = HBITS(ml); 941 ml = LBITS(ml); 942 for (j = 0; j < num; ++j) 943 mul_add(tp[j], ap[j], ml, mh, c0); 944 # else 945 for (j = 0; j < num; ++j) 946 mul_add(tp[j], ap[j], ml, c0); 947 # endif 948 c1 = (tp[num] + c0) & BN_MASK2; 949 tp[num] = c1; 950 tp[num + 1] = (c1 < c0 ? 1 : 0); 951 enter: 952 c1 = tp[0]; 953 ml = (c1 * n0) & BN_MASK2; 954 c0 = 0; 955 # ifdef mul64 956 mh = HBITS(ml); 957 ml = LBITS(ml); 958 mul_add(c1, np[0], ml, mh, c0); 959 # else 960 mul_add(c1, ml, np[0], c0); 961 # endif 962 for (j = 1; j < num; j++) { 963 c1 = tp[j]; 964 # ifdef mul64 965 mul_add(c1, np[j], ml, mh, c0); 966 # else 967 mul_add(c1, ml, np[j], c0); 968 # endif 969 tp[j - 1] = c1 & BN_MASK2; 970 } 971 c1 = (tp[num] + c0) & BN_MASK2; 972 tp[num - 1] = c1; 973 tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); 974 } 975 976 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 977 c0 = bn_sub_words(rp, tp, np, num); 978 if (tp[num] != 0 || c0 == 0) { 979 for (i = 0; i < num + 2; i++) 980 vp[i] = 0; 981 return 1; 982 } 983 } 984 for (i = 0; i < num; i++) 985 rp[i] = tp[i], vp[i] = 0; 986 vp[num] = 0; 987 vp[num + 1] = 0; 988 return 1; 989 } 990 # else 991 /* 992 * Return value of 0 indicates that multiplication/convolution was not 993 * performed to signal the caller to fall down to alternative/original 994 * code-path. 995 */ 996 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 997 const BN_ULONG *np, const BN_ULONG *n0, int num) 998 { 999 return 0; 1000 } 1001 # endif /* OPENSSL_BN_ASM_MONT */ 1002 # endif 1003 1004 #else /* !BN_MUL_COMBA */ 1005 1006 /* hmm... is it faster just to do a multiply? */ 1007 # undef bn_sqr_comba4 1008 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 1009 { 1010 BN_ULONG t[8]; 1011 bn_sqr_normal(r, a, 4, t); 1012 } 1013 1014 # undef bn_sqr_comba8 1015 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 1016 { 1017 BN_ULONG t[16]; 1018 bn_sqr_normal(r, a, 8, t); 1019 } 1020 1021 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1022 { 1023 r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); 1024 r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); 1025 r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); 1026 r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); 1027 } 1028 1029 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1030 { 1031 r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); 1032 r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); 1033 r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); 1034 r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); 1035 r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); 1036 r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); 1037 r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); 1038 r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); 1039 } 1040 1041 # ifdef OPENSSL_NO_ASM 1042 # ifdef OPENSSL_BN_ASM_MONT 1043 # include <alloca.h> 1044 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1045 const BN_ULONG *np, const BN_ULONG *n0p, int num) 1046 { 1047 BN_ULONG c0, c1, *tp, n0 = *n0p; 1048 volatile BN_ULONG *vp; 1049 int i = 0, j; 1050 1051 vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 1052 1053 for (i = 0; i <= num; i++) 1054 tp[i] = 0; 1055 1056 for (i = 0; i < num; i++) { 1057 c0 = bn_mul_add_words(tp, ap, num, bp[i]); 1058 c1 = (tp[num] + c0) & BN_MASK2; 1059 tp[num] = c1; 1060 tp[num + 1] = (c1 < c0 ? 1 : 0); 1061 1062 c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); 1063 c1 = (tp[num] + c0) & BN_MASK2; 1064 tp[num] = c1; 1065 tp[num + 1] += (c1 < c0 ? 1 : 0); 1066 for (j = 0; j <= num; j++) 1067 tp[j] = tp[j + 1]; 1068 } 1069 1070 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 1071 c0 = bn_sub_words(rp, tp, np, num); 1072 if (tp[num] != 0 || c0 == 0) { 1073 for (i = 0; i < num + 2; i++) 1074 vp[i] = 0; 1075 return 1; 1076 } 1077 } 1078 for (i = 0; i < num; i++) 1079 rp[i] = tp[i], vp[i] = 0; 1080 vp[num] = 0; 1081 vp[num + 1] = 0; 1082 return 1; 1083 } 1084 # else 1085 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1086 const BN_ULONG *np, const BN_ULONG *n0, int num) 1087 { 1088 return 0; 1089 } 1090 # endif /* OPENSSL_BN_ASM_MONT */ 1091 # endif 1092 1093 #endif /* !BN_MUL_COMBA */ 1094