/*
 * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

/*
 * Portable C implementations of the low-level BIGNUM word primitives.
 * The word-arithmetic macros used below -- mul(), mul_add(), sqr(),
 * mul64(), sqr64(), LBITS()/HBITS(), Lw()/Hw(), and the BN_MASK2*
 * constants -- come from "bn_local.h" and differ depending on whether
 * a double-width integer type (BN_LLONG) or a high-half multiply
 * (BN_UMULT_HIGH) is available.
 */

#include <assert.h>
#include <openssl/crypto.h>
#include "internal/cryptlib.h"
#include "bn_local.h"

#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)

/*
 * rp[i] += ap[i] * w for i in [0, num); returns the final carry word.
 * mul_add() (bn_local.h) folds one word product plus the running carry
 * c1 into rp[].
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;            /* running carry */

    assert(num >= 0);
    if (num <= 0)
        return c1;

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* Unrolled by four: (num & ~3) is non-zero while num >= 4. */
    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    /* Remaining 0..3 words. */
    while (num) {
        mul_add(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }

    return c1;
}

/*
 * rp[i] = ap[i] * w for i in [0, num); returns the final carry word.
 * Unlike bn_mul_add_words() the destination is overwritten rather than
 * accumulated into.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;            /* running carry */

    assert(num >= 0);
    if (num <= 0)
        return c1;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }
    return c1;
}

/*
 * r[2*i], r[2*i+1] = low and high word of a[i]^2 for i in [0, n);
 * r advances two words per input word.  Only the per-word squares are
 * produced here; no cross products are computed in this routine.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    while (n) {
        sqr(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}

#else                           /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */
/*
 * Same primitives for builds with neither a double-width type nor a
 * high-half multiply: the invariant multiplier w is split once into
 * half-word pieces with LBITS()/HBITS(), and the (bl, bh) variants of
 * mul()/mul_add() (bn_local.h) form the product from half words.
 */

/* rp[i] += ap[i] * w for i in [0, num); returns the final carry word. */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c = 0;             /* running carry */
    BN_ULONG bl, bh;            /* low/high halves of w */

    assert(num >= 0);
    if (num <= 0)
        return (BN_ULONG)0;

    /* Split the loop-invariant multiplier once, outside the loops. */
    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* Unrolled by four while at least 4 words remain. */
    while (num & ~3) {
        mul_add(rp[0], ap[0], bl, bh, c);
        mul_add(rp[1], ap[1], bl, bh, c);
        mul_add(rp[2], ap[2], bl, bh, c);
        mul_add(rp[3], ap[3], bl, bh, c);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul_add(rp[0], ap[0], bl, bh, c);
        ap++;
        rp++;
        num--;
    }
    return c;
}

/* rp[i] = ap[i] * w for i in [0, num); returns the final carry word. */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG carry = 0;
    BN_ULONG bl, bh;            /* low/high halves of w */

    assert(num >= 0);
    if (num <= 0)
        return (BN_ULONG)0;

    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (num & ~3) {
        mul(rp[0], ap[0], bl, bh, carry);
        mul(rp[1], ap[1], bl, bh, carry);
        mul(rp[2], ap[2], bl, bh, carry);
        mul(rp[3], ap[3], bl, bh, carry);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    while (num) {
        mul(rp[0], ap[0], bl, bh, carry);
        ap++;
        rp++;
        num--;
    }
    return carry;
}

/*
 * r[2*i], r[2*i+1] = low and high word of a[i]^2 for i in [0, n),
 * via the half-word based sqr64() macro (bn_local.h).
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        sqr64(r[0], r[1], a[0]);
        sqr64(r[2], r[3], a[1]);
        sqr64(r[4], r[5], a[2]);
        sqr64(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    while (n) {
        sqr64(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}

#endif                          /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */

#if defined(BN_LLONG) && defined(BN_DIV2W)

/*
 * Divide the double word (h, l) by d and return the quotient word.
 * With a double-width type available this is one native division.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
}

#else

/*
 * Divide the double word (h, l) by d and return the quotient word.
 * Schoolbook long division producing two half-word (BN_BITS4-bit)
 * quotient digits; used when no double-width type is available.
 */
/* I need to test this some more :-( */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
    int i, count = 2;           /* two quotient digits of BN_BITS4 bits */

    if (d == 0)
        return BN_MASK2;        /* all-ones result on division by zero */

    i = BN_num_bits_word(d);
    /* Caller must ensure the quotient fits in a single word. */
    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

    i = BN_BITS2 - i;           /* shift that normalizes d */
    if (h >= d)
        h -= d;

    if (i) {
        /* Normalize: shift divisor and dividend so d's top bit is set. */
        d <<= i;
        h = (h << i) | (l >> (BN_BITS2 - i));
        l <<= i;
    }
    dh = (d & BN_MASK2h) >> BN_BITS4;   /* high half of divisor */
    dl = (d & BN_MASK2l);               /* low half of divisor */
    for (;;) {
        /* Estimate the next quotient digit from the top halves. */
        if ((h >> BN_BITS4) == dh)
            q = BN_MASK2l;
        else
            q = h / dh;

        th = q * dh;
        tl = dl * q;
        /* The estimate can be too large; decrement q until it fits. */
        for (;;) {
            t = h - th;
            if ((t & BN_MASK2h) ||
                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
                break;
            q--;
            th -= dh;
            tl -= dl;
        }
        /* Subtract q * d from the remainder held in (h, l). */
        t = (tl >> BN_BITS4);
        tl = (tl << BN_BITS4) & BN_MASK2h;
        th += t;

        if (l < tl)
            th++;
        l -= tl;
        if (h < th) {
            h += d;
            q--;
        }
        h -= th;

        if (--count == 0)
            break;

        /* Shift in the next half word and compute the second digit. */
        ret = q << BN_BITS4;
        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
        l = (l & BN_MASK2l) << BN_BITS4;
    }
    ret |= q;
    return ret;
}
#endif                          /* defined(BN_LLONG) && defined(BN_DIV2W) */

#ifdef BN_LLONG
/*
 * r[i] = a[i] + b[i] + carry for i in [0, n); returns the final carry
 * (0 or 1).  The sum is accumulated in double-width ll; its high part
 * carries into the next word.
 */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULLONG ll = 0;

    assert(n >= 0);
    if (n <= 0)
        return (BN_ULONG)0;

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* Unrolled by four while at least 4 words remain. */
    while (n & ~3) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[1] + b[1];
        r[1] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[2] + b[2];
        r[2] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[3] + b[3];
        r[3] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    while (n) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a++;
        b++;
        r++;
        n--;
    }
    return (BN_ULONG)ll;
}
#else                           /* !BN_LLONG */
/*
 * Same contract as the BN_LLONG version above, but the carry is
 * detected with unsigned-wraparound comparisons instead of a
 * double-width type.
 */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG c, l, t;

    assert(n >= 0);
    if (n <= 0)
        return (BN_ULONG)0;

    c = 0;
# ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);            /* carry from adding previous carry */
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);           /* carry from adding the b word */
        r[0] = l;
        t = a[1];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[1]) & BN_MASK2;
        c += (l < t);
        r[1] = l;
        t = a[2];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[2]) & BN_MASK2;
        c += (l < t);
        r[2] = l;
        t = a[3];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[3]) & BN_MASK2;
        c += (l < t);
        r[3] = l;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    while (n) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);
        r[0] = l;
        a++;
        b++;
        r++;
        n--;
    }
    return (BN_ULONG)c;
}
#endif                          /* !BN_LLONG */

/*
 * r[i] = a[i] - b[i] - borrow for i in [0, n); returns the final
 * borrow (0 or 1).  Borrow detection relies on unsigned wraparound:
 * x - y exceeds x exactly when the subtraction underflowed.
 */
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG t1, t2;
    int c = 0;                  /* running borrow */

    assert(n >= 0);
    if (n <= 0)
        return (BN_ULONG)0;

#ifndef OPENSSL_SMALL_FOOTPRINT
    while (n & ~3) {
        t1 = a[0];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);          /* borrow from subtracting previous borrow */
        t1 = b[0];
        t1 = (t2 - t1) & BN_MASK2;
        r[0] = t1;
        c += (t1 > t2);         /* borrow from subtracting the b word */
        t1 = a[1];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);
        t1 = b[1];
        t1 = (t2 - t1) & BN_MASK2;
        r[1] = t1;
        c += (t1 > t2);
        t1 = a[2];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);
        t1 = b[2];
        t1 = (t2 - t1) & BN_MASK2;
        r[2] = t1;
        c += (t1 > t2);
        t1 = a[3];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);
        t1 = b[3];
        t1 = (t2 - t1) & BN_MASK2;
        r[3] = t1;
        c += (t1 > t2);
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
#endif
    while (n) {
        t1 = a[0];
        t2 = (t1 - c) & BN_MASK2;
        c = (t2 > t1);
        t1 = b[0];
        t1 = (t2 - t1) & BN_MASK2;
        r[0] = t1;
        c += (t1 > t2);
        a++;
        b++;
        r++;
        n--;
    }
    return c;
}

#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)

# undef bn_mul_comba8
# undef bn_mul_comba4
# undef bn_sqr_comba8
# undef bn_sqr_comba4

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */

# ifdef BN_LLONG
/*
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */
/*
 * Three-word accumulator (c2,c1,c0) primitives built on a double-width
 * BN_ULLONG product.  The "no carry" notes mark additions that cannot
 * wrap for the reason stated above.
 */
# define mul_add_c(a,b,c0,c1,c2) do {   \
        BN_ULONG hi;                    \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

/* Adds the product twice (symmetric cross terms of a square). */
# define mul_add_c2(a,b,c0,c1,c2) do {  \
        BN_ULONG hi;                    \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        BN_ULLONG tt = t+c0;    /* no carry */  \
        c0 = (BN_ULONG)Lw(tt);                  \
        hi = (BN_ULONG)Hw(tt);                  \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do {   \
        BN_ULONG hi;                    \
        BN_ULLONG t = (BN_ULLONG)a[i]*a[i];     \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2)     \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_LOHI)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
/* Variants using the two-result multiply macro BN_UMULT_LOHI. */
# define mul_add_c(a,b,c0,c1,c2) do {   \
        BN_ULONG ta = (a), tb = (b);    \
        BN_ULONG lo, hi;                \
        BN_UMULT_LOHI(lo,hi,ta,tb);     \
        c0 += lo; hi += (c0<lo);        \
        c1 += hi; c2 += (c1<hi);        \
        } while(0)

/* Adds the product twice (symmetric cross terms of a square). */
# define mul_add_c2(a,b,c0,c1,c2) do {  \
        BN_ULONG ta = (a), tb = (b);    \
        BN_ULONG lo, hi, tt;            \
        BN_UMULT_LOHI(lo,hi,ta,tb);     \
        c0 += lo; tt = hi + (c0<lo);    \
        c1 += tt; c2 += (c1<tt);        \
        c0 += lo; hi += (c0<lo);        \
        c1 += hi; c2 += (c1<hi);        \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do {   \
        BN_ULONG ta = (a)[i];           \
        BN_ULONG lo, hi;                \
        BN_UMULT_LOHI(lo,hi,ta,ta);     \
        c0 += lo; hi += (c0<lo);        \
        c1 += hi; c2 += (c1<hi);        \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2)     \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_HIGH)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
/* Variants using a plain low product plus BN_UMULT_HIGH for the top. */
# define mul_add_c(a,b,c0,c1,c2) do {   \
        BN_ULONG ta = (a), tb = (b);    \
        BN_ULONG lo = ta * tb;          \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
        c0 += lo; hi += (c0<lo);        \
        c1 += hi; c2 += (c1<hi);        \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do {  \
        BN_ULONG ta = (a), tb = (b), tt; \
        BN_ULONG lo = ta * tb;          \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
        c0 += lo; tt = hi + (c0<lo);    \
        c1 += tt; c2 += (c1<tt);        \
        c0 += lo; hi += (c0<lo);        \
        c1 += hi; c2 += (c1<hi);        \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do {   \
        BN_ULONG ta = (a)[i];           \
        BN_ULONG lo = ta * ta;          \
        BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
        c0 += lo; hi += (c0<lo);        \
        c1 += hi; c2 += (c1<hi);        \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2)     \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# else                          /* !BN_LLONG */
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
/* Variants built entirely from half-word operations (mul64/sqr64). */
# define mul_add_c(a,b,c0,c1,c2) do {   \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        c0 = (c0+lo)&BN_MASK2; hi += (c0<lo);   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

/* Adds the product twice (symmetric cross terms of a square). */
# define mul_add_c2(a,b,c0,c1,c2) do {  \
        BN_ULONG tt;                    \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        tt = hi;                                \
        c0 = (c0+lo)&BN_MASK2; tt += (c0<lo);   \
        c1 = (c1+tt)&BN_MASK2; c2 += (c1<tt);   \
        c0 = (c0+lo)&BN_MASK2; hi += (c0<lo);   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do {   \
        BN_ULONG lo, hi;                \
        sqr64(lo,hi,(a)[i]);            \
        c0 = (c0+lo)&BN_MASK2; hi += (c0<lo);   \
        c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi);   \
        } while(0)

# define sqr_add_c2(a,i,j,c0,c1,c2)     \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif                         /* !BN_LLONG */

/*
 * r[0..15] = a[0..7] * b[0..7] (8x8-word Comba multiplication).
 * Column k accumulates every product a[i]*b[j] with i + j == k into a
 * three-word accumulator; the accumulator registers rotate through
 * (c1,c2,c3) as result words are emitted.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/* r[0..7] = a[0..3] * b[0..3] (4x4-word Comba multiplication). */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

/*
 * r[0..15] = a[0..7]^2 (8-word Comba squaring).  Diagonal terms use
 * sqr_add_c(); each off-diagonal product is added twice via
 * sqr_add_c2().
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/* r[0..7] = a[0..3]^2 (4-word Comba squaring). */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
/*
 * This is essentially reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%.
Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
 */
/*
 * Montgomery multiplication of num-word vectors: interleaved
 * multiply-and-reduce using the single word *n0p (presumably the
 * precomputed -np^-1 mod 2^BN_BITS2 -- confirm against bn_local.h).
 * Always returns 1.  The scratch buffer is cleared through the
 * volatile alias vp so the wiping stores are not optimized away.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, ml, *tp, n0;
#   ifdef mul64
    BN_ULONG mh;
#   endif
    volatile BN_ULONG *vp;
    int i = 0, j;

#   if 0                        /* template for platform-specific
                                 * implementation */
    if (ap == bp)
        return bn_sqr_mont(rp, ap, np, n0p, num);
#   endif
    /* num + 2 words: num result words plus two words of carry. */
    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    n0 = *n0p;

    /*
     * First outer iteration is peeled: tp[] is uninitialized, so the
     * multiply phase uses mul() (overwrite) instead of mul_add(),
     * then jumps into the loop at "enter" for the reduction phase.
     */
    c0 = 0;
    ml = bp[0];
#   ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, mh, c0);
#   else
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, c0);
#   endif

    tp[num] = c0;
    tp[num + 1] = 0;
    goto enter;

    for (i = 0; i < num; i++) {
        c0 = 0;
        ml = bp[i];
        /* tp[] += ap[] * bp[i] */
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, mh, c0);
#   else
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, c0);
#   endif
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);
 enter:
        /* Reduction: add ml * np[] so tp[0] becomes zero, then shift. */
        c1 = tp[0];
        ml = (c1 * n0) & BN_MASK2;
        c0 = 0;
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        mul_add(c1, np[0], ml, mh, c0);
#   else
        mul_add(c1, ml, np[0], c0);
#   endif
        for (j = 1; j < num; j++) {
            c1 = tp[j];
#   ifdef mul64
            mul_add(c1, np[j], ml, mh, c0);
#   else
            mul_add(c1, ml, np[j], c0);
#   endif
            tp[j - 1] = c1 & BN_MASK2;  /* word shift down by one */
        }
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num - 1] = c1;
        tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
    }

    /* Conditional final subtraction of the modulus. */
    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;      /* wipe scratch before returning */
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
#  else
/*
 * Return value of 0 indicates that multiplication/convolution was not
 * performed to signal the caller to fall down to alternative/original
 * code-path.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
#  endif                        /* OPENSSL_BN_ASM_MONT */
# endif

#else                           /* !BN_MUL_COMBA */

/* hmm... is it faster just to do a multiply?
 */
# undef bn_sqr_comba4
# undef bn_sqr_comba8

/*
 * Fallback comba entry points when BN_MUL_COMBA is not defined: the
 * fixed-size routines are expressed via the generic word primitives
 * above (and bn_sqr_normal() for squaring).
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[8];              /* scratch for bn_sqr_normal() */
    bn_sqr_normal(r, a, 4, t);
}

void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[16];             /* scratch for bn_sqr_normal() */
    bn_sqr_normal(r, a, 8, t);
}

/* r[0..7] = a[0..3] * b[0..3], schoolbook row by row. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
    r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
    r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
    r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
}

/* r[0..15] = a[0..7] * b[0..7], schoolbook row by row. */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
    r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
    r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
    r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
    r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
    r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
    r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
    r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
}

# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
/*
 * Montgomery multiplication built directly on the generic word
 * routines; same contract as the comba-path bn_mul_mont (always
 * returns 1; scratch is wiped through the volatile alias vp so the
 * clearing stores are not optimized away).
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, *tp, n0 = *n0p;
    volatile BN_ULONG *vp;
    int i = 0, j;

    /* num + 2 words: num result words plus two words of carry. */
    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    for (i = 0; i <= num; i++)
        tp[i] = 0;

    for (i = 0; i < num; i++) {
        /* tp[] += ap[] * bp[i] */
        c0 = bn_mul_add_words(tp, ap, num, bp[i]);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);

        /* Reduce: add (tp[0] * n0) * np[] so tp[0] becomes zero ... */
        c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] += (c1 < c0 ? 1 : 0);
        /* ... then shift tp[] down by one word. */
        for (j = 0; j <= num; j++)
            tp[j] = tp[j + 1];
    }

    /* Conditional final subtraction of the modulus. */
    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;      /* wipe scratch before returning */
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
#  else
/*
 * Return value of 0 signals the caller to fall back to the
 * alternative/original code path.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
#  endif                        /* OPENSSL_BN_ASM_MONT */
# endif

#endif                          /* !BN_MUL_COMBA */