#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice (>2x) as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can provide further
# improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule inner loops in such a manner that they
# "fall through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	jmp	.Lsqr4x_enter

.align	16
.Lmul_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	2($num),%r10
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
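	# Note: the termination test below is placed between the mulq and
	# the adds that consume its %rax:%rdx result, so the compare and
	# branch overlap the multiplier latency (the modulo scheduling
	# mentioned in the August 2011 note above).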
	cmp	$num,$j
	jl	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jl	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
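	# Final reduction: subtract the modulus from tp[] with a running
	# borrow, writing the difference to rp[].  The borrow then drives a
	# branch-free mask (below) that selects between tp[] and rp[] for
	# the copy loop, so rp[] ends up fully reduced either way.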
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.Lmul4x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	4($num),%r10
	mov	%rsp,%r11
	neg	%r10
	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

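	# The 4x loop above leaves the last two limbs (indices $num-2 and
	# $num-1) to this tail; together with the two limbs handled before
	# the loop, each pass covers all $num limbs.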
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___
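# Post-condition for bn_mul4x_mont: the same subtract-and-select scheme as
# in the generic path above, but 4x unrolled and with the copy/zap of the
# temporary done through 16-byte SSE moves.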
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr4x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 4 and
			# not less than 8
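
# bn_sqr4x_mont computes the same Montgomery product as bn_mul_mont for the
# special case bp==ap, i.e. a Montgomery squaring.  Squaring is cheaper
# because every cross product a[i]*a[j], i!=j, is computed only once and
# then doubled, saving roughly half of the single-precision multiplications
# (hence the gains quoted in the July 2011 note above).
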
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___;
.type	bn_sqr4x_mont,\@function,6
.align	16
bn_sqr4x_mont:
.Lsqr4x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	mov	%rsp,%r11		# put aside %rsp
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
	and	\$-1024,%rsp		# minimize TLB usage
	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +32	saved $rptr
	# +40	saved $nptr
	# +48	saved *n0
	# +56	saved %rsp
	# +64	t[2*$num]
	#
	mov	$rptr,32(%rsp)		# save $rptr
	mov	$nptr,40(%rsp)
	mov	$n0, 48(%rsp)
	mov	%r11, 56(%rsp)		# save original %rsp
.Lsqr4x_body:
	##############################################################
	# Squaring part:
	#
	# a) multiply-n-add everything but a[i]*a[i];
	# b) shift result of a) by 1 to the left and accumulate
	#    a[i]*a[i] products;
	#
	lea	32(%r10),$i		# $i=-($num-32)
	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]

	mov	$num,$j			# $j=$num

					# comments apply to $num==8 case
	mov	-32($aptr,$i),$a0	# a[0]
	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr,$i),%rax	# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr,$i),$ai	# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	mov	%rax,$A0[0]		# a[1]*a[0]
	mov	$ai,%rax		# a[2]
	mov	%rdx,$A0[1]
	mov	$A0[0],-24($tptr,$i)	# t[1]

	xor	$A0[0],$A0[0]
	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	mov	$ai,%rax
	adc	%rdx,$A0[0]
	mov	$A0[1],-16($tptr,$i)	# t[2]

	lea	-16($i),$j		# j=-16

	mov	8($aptr,$j),$ai		# a[3]
	mul	$a1			# a[2]*a[1]
	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
	mov	$ai,%rax
	mov	%rdx,$A1[1]

	xor	$A0[1],$A0[1]
	add	$A1[0],$A0[0]
	lea	16($j),$j
	adc	\$0,$A0[1]
	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	mov	$ai,%rax
	adc	%rdx,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[3]
	jmp	.Lsqr4x_1st

.align	16
.Lsqr4x_1st:
	mov	($aptr,$j),$ai		# a[4]
	xor	$A1[0],$A1[0]
	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
	mov	$ai,%rax
	adc	%rdx,$A1[0]

	xor	$A0[0],$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]
	mul	$a0			# a[4]*a[0]
	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
	mov	$ai,%rax		# a[3]
	adc	%rdx,$A0[0]
	mov	$A0[1],($tptr,$j)	# t[4]

	mov	8($aptr,$j),$ai		# a[5]
	xor	$A1[1],$A1[1]
	mul	$a1			# a[4]*a[3]
	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
	mov	$ai,%rax
	adc	%rdx,$A1[1]

	xor	$A0[1],$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mul	$a0			# a[5]*a[2]
	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
	mov	$ai,%rax
	adc	%rdx,$A0[1]
	mov	$A0[0],8($tptr,$j)	# t[5]

	mov	16($aptr,$j),$ai	# a[6]
	xor	$A1[0],$A1[0]
	mul	$a1			# a[5]*a[3]
	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
	mov	$ai,%rax
	adc	%rdx,$A1[0]

	xor	$A0[0],$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]
	mul	$a0			# a[6]*a[2]
	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
	mov	$ai,%rax		# a[3]
	adc	%rdx,$A0[0]
	mov	$A0[1],16($tptr,$j)	# t[6]

	mov	24($aptr,$j),$ai	# a[7]
	xor	$A1[1],$A1[1]
	mul	$a1			# a[6]*a[5]
	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
	mov	$ai,%rax
	adc	%rdx,$A1[1]

	xor	$A0[1],$A0[1]
	add	$A1[0],$A0[0]
	lea	32($j),$j
	adc	\$0,$A0[1]
	mul	$a0			# a[7]*a[4]
	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
	mov	$ai,%rax
	adc	%rdx,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[7]

	cmp	\$0,$j
	jne	.Lsqr4x_1st

	xor	$A1[0],$A1[0]
	add	$A0[1],$A1[1]
	adc	\$0,$A1[0]
	mul	$a1			# a[7]*a[5]
	add	%rax,$A1[1]
	adc	%rdx,$A1[0]

	mov	$A1[1],($tptr)		# t[8]
	lea	16($i),$i
	mov	$A1[0],8($tptr)		# t[9]
	jmp	.Lsqr4x_outer

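	# Each pass of .Lsqr4x_outer advances ($a0,$a1) to the next pair of
	# a[] words and folds their cross products with all higher a[] words
	# into the t[] window; the pair (a[0],a[1]) was handled above.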
.align	16
.Lsqr4x_outer:				# comments apply to $num==6 case
	mov	-32($aptr,$i),$a0	# a[0]
	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr,$i),%rax	# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr,$i),$ai	# a[2]
	mov	%rax,$a1

	mov	-24($tptr,$i),$A0[0]	# t[1]
	xor	$A0[1],$A0[1]
	mul	$a0			# a[1]*a[0]
	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
	mov	$ai,%rax		# a[2]
	adc	%rdx,$A0[1]
	mov	$A0[0],-24($tptr,$i)	# t[1]

	xor	$A0[0],$A0[0]
	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
	adc	\$0,$A0[0]
	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	mov	$ai,%rax
	adc	%rdx,$A0[0]
	mov	$A0[1],-16($tptr,$i)	# t[2]

	lea	-16($i),$j		# j=-16
	xor	$A1[0],$A1[0]

	mov	8($aptr,$j),$ai		# a[3]
	xor	$A1[1],$A1[1]
	add	8($tptr,$j),$A1[0]
	adc	\$0,$A1[1]
	mul	$a1			# a[2]*a[1]
	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
	mov	$ai,%rax
	adc	%rdx,$A1[1]

	xor	$A0[1],$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	mov	$ai,%rax
	adc	%rdx,$A0[1]
	mov	$A0[0],8($tptr,$j)	# t[3]

	lea	16($j),$j
	jmp	.Lsqr4x_inner

.align	16
.Lsqr4x_inner:
	mov	($aptr,$j),$ai		# a[4]
	xor	$A1[0],$A1[0]
	add	($tptr,$j),$A1[1]
	adc	\$0,$A1[0]
	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
	mov	$ai,%rax
	adc	%rdx,$A1[0]

	xor	$A0[0],$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]
	mul	$a0			# a[4]*a[0]
	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
	mov	$ai,%rax		# a[3]
	adc	%rdx,$A0[0]
	mov	$A0[1],($tptr,$j)	# t[4]

	mov	8($aptr,$j),$ai		# a[5]
	xor	$A1[1],$A1[1]
	add	8($tptr,$j),$A1[0]
	adc	\$0,$A1[1]
	mul	$a1			# a[4]*a[3]
	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
	mov	$ai,%rax
	adc	%rdx,$A1[1]

	xor	$A0[1],$A0[1]
	add	$A1[0],$A0[0]
	lea	16($j),$j		# j++
	adc	\$0,$A0[1]
	mul	$a0			# a[5]*a[2]
	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
	mov	$ai,%rax
	adc	%rdx,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below

	cmp	\$0,$j
	jne	.Lsqr4x_inner

	xor	$A1[0],$A1[0]
	add	$A0[1],$A1[1]
	adc	\$0,$A1[0]
	mul	$a1			# a[5]*a[3]
	add	%rax,$A1[1]
	adc	%rdx,$A1[0]

	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below

	add	\$16,$i
	jnz	.Lsqr4x_outer

					# comments apply to $num==4 case
	mov	-32($aptr),$a0		# a[0]
	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr),%rax		# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr),$ai		# a[2]
	mov	%rax,$a1

	xor	$A0[1],$A0[1]
	mul	$a0			# a[1]*a[0]
	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
	mov	$ai,%rax		# a[2]
	adc	%rdx,$A0[1]
	mov	$A0[0],-24($tptr)	# t[1]

	xor	$A0[0],$A0[0]
	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
	adc	\$0,$A0[0]
	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	mov	$ai,%rax
	adc	%rdx,$A0[0]
	mov	$A0[1],-16($tptr)	# t[2]

	mov	-8($aptr),$ai		# a[3]
	mul	$a1			# a[2]*a[1]
	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
	mov	$ai,%rax
	adc	\$0,%rdx

	xor	$A0[1],$A0[1]
	add	$A1[0],$A0[0]
	mov	%rdx,$A1[1]
	adc	\$0,$A0[1]
	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	mov	$ai,%rax
	adc	%rdx,$A0[1]
	mov	$A0[0],-8($tptr)	# t[3]

	xor	$A1[0],$A1[0]
	add	$A0[1],$A1[1]
	adc	\$0,$A1[0]
	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]
	mov	-16($aptr),%rax		# a[2]
	adc	%rdx,$A1[0]

	mov	$A1[1],($tptr)		# t[4]
	mov	$A1[0],8($tptr)		# t[5]

	mul	$ai			# a[2]*a[3]
___
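##############################################################
# Phase b) of the squaring: double the accumulated cross-product vector
# t[] (a one-bit left shift across the whole 2*$num-word vector) while
# folding in the a[i]*a[i] squares.  $shift carries the bit shifted out
# of the previous word and $carry re-creates the addition carry across
# iterations via the sbb/neg pair.
#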
{
my ($shift,$carry)=($a0,$a1);
my @S=(@A1,$ai,$n0);
$code.=<<___;
	add	\$16,$i
	xor	$shift,$shift
	sub	$num,$i			# $i=16-$num
	xor	$carry,$carry

	add	$A1[0],%rax		# t[5]
	adc	\$0,%rdx
	mov	%rax,8($tptr)		# t[5]
	mov	%rdx,16($tptr)		# t[6]
	mov	$carry,24($tptr)	# t[7]

	mov	-16($aptr,$i),%rax	# a[0]
	lea	64(%rsp,$num,2),$tptr
	xor	$A0[0],$A0[0]		# t[0]
	mov	-24($tptr,$i,2),$A0[1]	# t[1]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],-32($tptr,$i,2)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	mov	$S[1],-24($tptr,$i,2)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],-16($tptr,$i,2)
	adc	%rdx,$S[3]
	lea	16($i),$i
	mov	$S[3],-40($tptr,$i,2)
	sbb	$carry,$carry		# mov cf,$carry
	jmp	.Lsqr4x_shift_n_add

.align	16
.Lsqr4x_shift_n_add:
	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],-32($tptr,$i,2)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	mov	$S[1],-24($tptr,$i,2)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],-16($tptr,$i,2)
	adc	%rdx,$S[3]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	mov	$S[3],-8($tptr,$i,2)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],0($tptr,$i,2)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	mov	$S[1],8($tptr,$i,2)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr,$i,2)
	adc	%rdx,$S[3]
	mov	$S[3],24($tptr,$i,2)
	sbb	$carry,$carry		# mov cf,$carry
	add	\$32,$i
	jnz	.Lsqr4x_shift_n_add

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr),%rax		# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
	mov	$S[1],-24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	adc	%rax,$S[2]
	adc	%rdx,$S[3]
	mov	$S[2],-16($tptr)
	mov	$S[3],-8($tptr)
___
}
##############################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
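# On entry t[0..2*$num-1] holds the plain square a*a.  For each word of
# the result a factor m = t[j]*n0 mod 2^64 is formed and m*n[] is added so
# that t[j] cancels; the loop below interleaves two such words per outer
# iteration ($m0 and $m1) and slides the t[] window up by 16 bytes each
# pass, leaving the reduced result in the upper half of t[].
#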
{
my ($topbit,$nptr)=("%rbp",$aptr);
my ($m0,$m1)=($a0,$a1);
my @Ni=("%rbx","%r9");
$code.=<<___;
	mov	40(%rsp),$nptr		# restore $nptr
	mov	48(%rsp),$n0		# restore *n0
	xor	$j,$j
	mov	$num,0(%rsp)		# save $num
	sub	$num,$j			# $j=-$num
	mov	64(%rsp),$A0[0]		# t[0]		# modsched #
	mov	$n0,$m0			#		# modsched #
	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
	lea	64(%rsp,$num),$tptr	# end of t[] window
	mov	%rax,8(%rsp)		# save end of t[] buffer
	lea	($nptr,$num),$nptr	# end of n[] buffer
	xor	$topbit,$topbit		# $topbit=0

	mov	0($nptr,$j),%rax	# n[0]		# modsched #
	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
	mov	%rax,$Ni[0]		#		# modsched #
	jmp	.Lsqr4x_mont_outer

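	# Instructions tagged "# modsched #" prime n[0], n[1], t[0] and m0
	# for the upcoming outer iteration, so each .Lsqr4x_mont_outer pass
	# starts with its operands already loaded (software pipelining).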
.align	16
.Lsqr4x_mont_outer:
	xor	$A0[1],$A0[1]
	mul	$m0			# n[0]*m0
	add	%rax,$A0[0]		# n[0]*m0+t[0]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]
	mov	$n0,$m1

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[1]*m0
	add	%rax,$A0[1]		# n[1]*m0+t[1]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]

	imulq	$A0[1],$m1

	mov	16($nptr,$j),$Ni[0]	# n[2]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[0]*m1
	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[1]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[2]*m0
	add	%rax,$A0[0]		# n[2]*m0+t[2]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[3]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[1]*m1
	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[2]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[3]*m0
	add	%rax,$A0[1]		# n[3]*m0+t[3]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	jmp	.Lsqr4x_mont_inner

.align	16
.Lsqr4x_mont_inner:
	mov	($nptr,$j),$Ni[0]	# n[4]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[2]*m1
	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr,$j)	# "t[3]"

	xor	$A0[1],$A0[1]
	add	($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[4]*m0
	add	%rax,$A0[0]		# n[4]*m0+t[4]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	8($nptr,$j),$Ni[1]	# n[5]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[3]*m1
	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr,$j)	# "t[4]"

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[5]*m0
	add	%rax,$A0[1]		# n[5]*m0+t[5]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]

	mov	16($nptr,$j),$Ni[0]	# n[6]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[4]*m1
	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
	mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[5]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[6]*m0
	add	%rax,$A0[0]		# n[6]*m0+t[6]
	mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[7]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[5]*m1
	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[6]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[7]*m0
	add	%rax,$A0[1]		# n[7]*m0+t[7]
	mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	cmp	\$0,$j
	jne	.Lsqr4x_mont_inner

	sub	0(%rsp),$j		# $j=-$num	# modsched #
	mov	$n0,$m0			#		# modsched #

	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[6]*m1
	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr)	# "t[7]"

	xor	$A0[1],$A0[1]
	add	($tptr),$A0[0]		# +t[8]
	adc	\$0,$A0[1]
	mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
	add	$topbit,$A0[0]
	adc	\$0,$A0[1]

	imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
	xor	$A1[0],$A1[0]
	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	add	$A0[0],$A1[1]
	mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
	adc	\$0,$A1[0]
	mul	$m1			# n[7]*m1
	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
	mov	$Ni[0],%rax		#		# modsched #
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr)		# "t[8]"

	xor	$topbit,$topbit
	add	8($tptr),$A1[0]		# +t[9]
	adc	$topbit,$topbit
	add	$A0[1],$A1[0]
	lea	16($tptr),$tptr		# "t[$num]>>128"
	adc	\$0,$topbit
	mov	$A1[0],-8($tptr)	# "t[9]"
	cmp	8(%rsp),$tptr		# are we done?
	jb	.Lsqr4x_mont_outer

	mov	0(%rsp),$num		# restore $num
	mov	$topbit,($tptr)		# save $topbit
___
}
##############################################################
# Post-condition, 4x unrolled copy from bn_mul_mont
#
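# The upper half of t[] now holds the Montgomery-reduced square: subtract
# the modulus, select the reduced value with a borrow-derived mask as in
# bn_mul_mont, and zero the stack temporary while copying the result out.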
{
my ($tptr,$nptr)=("%rbx",$aptr);
my @ri=("%rax","%rdx","%r10","%r11");
$code.=<<___;
	mov	64(%rsp,$num),@ri[0]	# tp[0]
	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
	mov	40(%rsp),$nptr		# restore $nptr
	shr	\$5,$num		# num/4
	mov	8($tptr),@ri[1]		# t[1]
	xor	$i,$i			# i=0 and clear CF!

	mov	32(%rsp),$rptr		# restore $rptr
	sub	0($nptr),@ri[0]
	mov	16($tptr),@ri[2]	# t[2]
	mov	24($tptr),@ri[3]	# t[3]
	sbb	8($nptr),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsqr4x_sub
.align	16
.Lsqr4x_sub:
	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($nptr,$i,8),@ri[2]
	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
	mov	40($tptr,$i,8),@ri[1]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($nptr,$i,8),@ri[0]
	mov	48($tptr,$i,8),@ri[2]
	mov	56($tptr,$i,8),@ri[3]
	sbb	40($nptr,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsqr4x_sub

	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
	sbb	16($nptr,$i,8),@ri[2]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$tptr
	not	@ri[0]
	mov	$rptr,$nptr
	and	@ri[0],$nptr
	lea	-1($num),$j
	or	$nptr,$tptr		# tp=borrow?tp:rp

	pxor	%xmm0,%xmm0
	lea	64(%rsp,$num,8),$nptr
	movdqu	($tptr),%xmm1
	lea	($nptr,$num,8),$nptr
	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
	movdqu	%xmm1,($rptr)
	jmp	.Lsqr4x_copy
.align	16
.Lsqr4x_copy:				# copy or in-place refresh
	movdqu	16($tptr,$i),%xmm2
	movdqu	32($tptr,$i),%xmm1
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
	movdqu	%xmm1,32($rptr,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lsqr4x_copy

	movdqu	16($tptr,$i),%xmm2
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
___
}
$code.=<<___;
	mov	56(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lsqr4x_epilogue:
	ret
.size	bn_sqr4x_mont,.-bn_sqr4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

print $code;
close STDOUT;