#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# modifying bn_exp.c.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such manner that b[0] is $bp[idx],
				# b[1] is [2^5+idx], etc.
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

.Lmul_alloca:
	mov	%rsp,%rax
	lea	2($num),%r11
	neg	%r11
	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	24-112(%rsp,$num,8),%r10	# place the mask after tp[num+3] (+ICache optimization)
	and	\$-16,%r10

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jl	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	and	\$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jl	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	mov	${num}d,${num}d
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

.Lmul4x_alloca:
	mov	%rsp,%rax
	lea	4($num),%r11
	neg	%r11
	lea	-256(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4)+256)
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	lea	128(%rdx),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	32-112(%rsp,$num,8),%r10	# place the mask after tp[num+4] (+ICache optimization)

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	lea	32+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp
	movd	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0

	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=2
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}

{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") :	# Win64 order
				("%rdi","%rsi","%rdx","%ecx");	# Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.size	bn_scatter5,.-bn_scatter5

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	16
bn_gather5:
.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
	# I can't trust assembler to use specific encoding:-(
	.byte	0x4c,0x8d,0x14,0x24		# lea (%rsp),%r10
	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# sub $0x108,%rsp
	lea	.Linc(%rip),%rax
	and	\$-16,%rsp		# shouldn't be formally required

	movd	$idx,%xmm5
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	128($tbl),%r11		# size optimization
	lea	128(%rsp),%rax		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to $idx and save result to stack
#
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
___
$code.=<<___	if ($i);
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
___
$code.=<<___;
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)-128`(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)-128`(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)-128`(%rax)
	movdqa	%xmm4,%xmm2
___
}
$code.=<<___;
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`(%r11),%xmm0
	movdqa	`16*($i+1)-128`(%r11),%xmm1
	movdqa	`16*($i+2)-128`(%r11),%xmm2
	pand	`16*($i+0)-128`(%rax),%xmm0
	movdqa	`16*($i+3)-128`(%r11),%xmm3
	pand	`16*($i+1)-128`(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)-128`(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)-128`(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	lea	$STRIDE(%r11),%r11
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather

	lea	(%r10),%rsp
	ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
___
}
$code.=<<___;
.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	lea	48(%rax),%rax

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# end of alloca label
	cmp	%r10,%rbx		# context->Rip<end of alloca label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea r10,(rsp), set_frame r10
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;
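
# For reference, the constant-time gather performed by bn_gather5 and by the
# pand/por sequences in the .Louter loops above is equivalent to the C sketch
# below. This is an illustration only, assuming 64-bit limbs and the same
# 32-entry interleaved powers table produced by bn_scatter5; the names are
# hypothetical and the sketch is not part of the generated module.
#
#	BN_ULONG gather_limb(const BN_ULONG *tbl, int limb, unsigned int idx)
#	{
#		BN_ULONG acc = 0;
#		unsigned int k;
#
#		for (k = 0; k < 32; k++) {
#			/* mask is all-ones only when k == idx */
#			BN_ULONG mask = 0 - (BN_ULONG)(k == idx);
#			acc |= tbl[limb * 32 + k] & mask;
#		}
#		return acc;
#	}
#
# Every table element is read on every call, so the memory access pattern is
# independent of the secret index, which is the point of the exercise.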