/*
 * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
#else
/*-
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features own ABI which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference.
 *    Win64 implements different ABI for AMD64, different from Linux.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                          sign    verify    sign/s verify/s
 * rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 * rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 * rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 * rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *                          sign    verify    sign/s verify/s
 * dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 * dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 * dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 * 'apps/openssl speed rsa dsa' output with this module:
 *
 *                          sign    verify    sign/s verify/s
 * rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 * rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 * rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 * rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *                          sign    verify    sign/s verify/s
 * dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 * dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 * dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 * For the reference. IA-32 assembler implementation performs
 * very much like 64-bit code compiled with no-asm on the same
 * machine.
 */

# undef mul
# undef mul_add

/*-
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) let the compiler to decide where does it
 * want to keep the value of zero;
 */
/*
 * mul_add(r,a,word,carry): r += a*word + carry; carry receives the high
 * word of the result.  Three steps: (1) mulq leaves the 128-bit product
 * a*word in high:low (rdx:rax); (2) fold low into carry, propagating any
 * overflow into high; (3) add that sum into r in memory, again propagating
 * overflow into high; finally carry = high.
 */
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)

/*
 * mul(r,a,word,carry): r = low word of a*word + carry; carry = high word
 * of the product plus the overflow from the low-word addition.
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"g"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
# undef sqr
/* sqr(r0,r1,a): (r1,r0) = a^2 -- the full double-word square of one word. */
# define sqr(r0,r1,a)                   \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");

/*
 * rp[i] += ap[i] * w for i in [0, num); returns the final carry word.
 * The main loop handles four words per iteration ("num & ~3" is non-zero
 * while num >= 4); the tail then handles the remaining 0-3 words.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {
        mul_add(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[2], ap[2], w, c1);
        return c1;
    }

    return c1;
}

/*
 * rp[i] = ap[i] * w for i in [0, num), carrying between words; returns the
 * final carry word.  Same 4-way unrolling as bn_mul_add_words above.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {
        mul(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[2], ap[2], w, c1);
    }
    return c1;
}

/*
 * (r[2i+1], r[2i]) = a[i]^2 for i in [0, n); each input word produces a
 * two-word square, so r advances twice as fast as a (r must hold 2*n
 * words).
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    if (n <= 0)
        return;

    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
    if (n) {
        sqr(r[0], r[1], a[0]);
        if (--n == 0)
            return;
        sqr(r[2], r[3], a[1]);
        if (--n == 0)
            return;
        sqr(r[4], r[5], a[2]);
    }
}

/*
 * Returns (h:l) / d.  divq takes the 128-bit dividend in rdx:rax (h in
 * rdx, l in rax) and leaves the quotient in rax; the remainder, delivered
 * in rdx, is discarded via "waste".  The caller must ensure the quotient
 * fits in one word (h < d), as with the generic C implementation.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    asm("divq %4":"=a"(ret), "=d"(waste)
        : "a"(l), "d"(h), "r"(d)
        : "cc");

    return ret;
}

/*
 * rp[i] = ap[i] + bp[i] with carry propagation; returns the final carry
 * (0 or 1).  The loop keeps the carry flag live across iterations: the
 * index is advanced with lea and the counter with dec precisely because
 * neither instruction clobbers CF.  The closing sbbq materializes the
 * carry as 0 or all-ones, which "& 1" reduces to 0/1.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    asm volatile ("       subq    %0,%0           \n" /* clear carry */
                  "       jmp     1f              \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       adcq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
                  "       lea     1(%2),%2        \n"
                  "       dec     %1              \n"
                  "       jnz     1b              \n"
                  "       sbbq    %0,%0           \n"
                  :"=&r" (ret), "+c"(n), "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}

# ifndef SIMICS
/*
 * rp[i] = ap[i] - bp[i] with borrow propagation; returns the final borrow
 * (0 or 1).  Identical structure to bn_add_words above, with sbbq in
 * place of adcq.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    asm volatile ("       subq    %0,%0           \n" /* clear borrow */
                  "       jmp     1f              \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       sbbq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
                  "       lea     1(%2),%2        \n"
                  "       dec     %1              \n"
                  "       jnz     1b              \n"
                  "       sbbq    %0,%0           \n"
                  :"=&r" (ret), "+c"(n), "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}
# else
/* Simics 1.4<7 has buggy sbbq:-( */
#  define BN_MASK2 0xffffffffffffffffL
/*
 * Plain-C borrow-propagating subtraction, compiled only under SIMICS
 * because of the buggy sbbq noted above.  r[i] = a[i] - b[i] - borrow;
 * returns the final borrow (0 or 1).  Unrolled four words per loop pass.
 */
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    if (n <= 0)
        return (BN_ULONG)0;

    for (;;) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        /*
         * Borrow update: when t1 != t2 the new borrow is simply t1 < t2
         * (subtracting c, which is 0 or 1, cannot change which operand is
         * larger).  When t1 == t2 the borrow is left unchanged: the
         * result is 0 (c == 0) or wraps to all-ones (c == 1), so the old
         * borrow is still correct.  Same reasoning for the three copies
         * below.
         */
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[1];
        t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[2];
        t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[3];
        t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        a += 4;
        b += 4;
        r += 4;
    }
    return c;
}
# endif

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
# if 0
/* original macros are kept for reference purposes */
# define mul_add_c(a,b,c0,c1,c2)      do {      \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2)     do {      \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; tt = hi+((c0<lo)?1:0);        \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2)      do {      \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
# else
/*
 * Assembler versions of the above: mulq leaves t2:t1 = a*b, then a single
 * three-word add chain folds t1 into c0, t2 plus the carry into c1, and
 * the final carry into c2.  The "g"(0) operand exists only so the adc
 * chain can propagate into c2.
 */
# define mul_add_c(a,b,c0,c1,c2) do {   \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)

# define sqr_add_c(a,i,c0,c1,c2) do {   \
        BN_ULONG t1,t2;                 \
        asm ("mulq %2"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a[i])             \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)

/* Same as mul_add_c but the accumulate chain runs twice: c += 2*a*b. */
# define mul_add_c2(a,b,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
# endif

# define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

/*
 * r[0..15] = a[0..7] * b[0..7], Comba (column-wise) multiplication: for
 * each output index k, every partial product a[i]*b[j] with i+j == k is
 * accumulated into a three-word accumulator, the low word is stored into
 * r[k] and zeroed, and the roles of c1/c2/c3 as low/mid/high rotate from
 * one column to the next.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/*
 * r[0..7] = a[0..3] * b[0..3]; 4-word version of bn_mul_comba8, same
 * column-wise accumulation scheme.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

/*
 * r[0..15] = a[0..7]^2, column-wise (Comba) squaring: diagonal terms via
 * sqr_add_c, doubled off-diagonal terms via sqr_add_c2.
 */
void bn_sqr_comba8(BN_ULONG *r,
                   const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/*
 * r[0..7] = a[0..3]^2; 4-word version of bn_sqr_comba8, same diagonal /
 * doubled off-diagonal scheme.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
#endif