#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing buffer data to designated lane of SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Skylake	(18.9	+5.00=23.9)/n	7.70	8.17		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput, nor is there
#	AES-NI-SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Windows-targeted flavours (nasm/masm/mingw64) need different calling
# convention handling and xmm save/restore in the generated code.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in the
# shared perlasm directory; all generated code is piped through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

push(@INC,"${dir}","${dir}../../perlasm");
require "x86_64-support.pl";

$ptr_size=&pointer_size($flavour);

# Probe the assembler for AVX/AVX2 support: $avx ends up 0 (none),
# 1 (AVX) or 2 (AVX2), depending on assembler version.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# All prints from here on go through the translator to $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));	# per-lane input pointers
$Tbl="%rbp";			# round-constant table pointer
$inp_elm_size=2*$ptr_size;	# sizeof(struct { void *ptr; int blocks; })
# SIMD register assignment: state words live in %xmm8-15, scratch in
# %xmm0-7.  Each 32-bit lane of a register belongs to one input buffer.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;	# SIMD register width in bytes (4 lanes); AVX2 path resets to 32

# Xi_off: stack address of message-schedule word $i (mod 16).  The
# 16-entry ring is split across two base registers: first 256 bytes
# off %rax, the remainder off %rbx, both biased by -128.
sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

# ROUND_00_15: emit one SHA256 round (rounds 0..15 also gather and
# byte-swap 4 bytes from each input lane; round 15 advances the input
# pointers by one 64-byte block).  Caller rotates @V between rounds.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd	`4*$i`(@ptr[0]),$Xi
	movd	`4*$i`(@ptr[1]),$t1
	movd	`4*$i`(@ptr[2]),$t2
	movd	`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___ if ($i==15);
	movd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	movd	`4*$i`(@ptr[1]),$t1
	lea	`16*4`(@ptr[1]),@ptr[1]
	movd	`4*$i`(@ptr[2]),$t2
	lea	`16*4`(@ptr[2]),@ptr[2]
	movd	`4*$i`(@ptr[3]),$t3
	lea	`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	movdqa	$e,$t1
	`"prefetcht0	63(@ptr[0])" if ($i==15)`
	pxor	$t3,$sigma
	movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	pandn	$g,$t1
	pand	$f,$axb
	pxor	$t2,$sigma

	`"prefetcht0	63(@ptr[1])" if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	pxor	$axb,$t1			# Ch(e,f,g)
	movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	pxor	$a,$axb				# a^b, b^c in next round

	`"prefetcht0	63(@ptr[2])" if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	pand	$axb,$bxc
	pxor	$sigma,$t2

	`"prefetcht0	63(@ptr[3])" if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);	# rotate the a^b/b^c alias for next round
}

# ROUND_16_XX: emit message-schedule expansion for round $i (>=16),
# i.e. X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]), then fall
# through to the common round body.
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	paddd	$sigma,$Xi			# Xi+=sigma0(e)
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma0(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);	# alternate schedule registers between rounds
}

# Entry point: dispatches to SHAEXT or AVX variants when the CPU
# supports them, otherwise runs the 4-lane SSE implementation below.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	# Merge the new state into the context: lanes whose counter hit
	# zero are masked out (pand $Xn) so exhausted inputs stay put.
	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
# SHAEXT variant: processes two buffers at a time with the SHA-NI
# instructions; each buffer gets its own ABEF/CDGH state pair.
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<2;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq	0x00-0x80($ctx),$ABEF0		# A1.A0
	movq	0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq	0x40-0x80($ctx),$CDGH0		# C1.C0
	movq	0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq	0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq	0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq	0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq	0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0		# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0		# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]	# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]	# H1.G1.H0.G0
	movdqa	K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa	$ABEF0,$ABEF1
	movdqa	$CDGH0,$CDGH1
	# Split the interleaved state into the ABEF/CDGH layout required
	# by sha256rnds2, one pair per buffer.
	punpcklqdq	@MSG1[0],$ABEF0		# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0		# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1		# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1		# H1.G1.D1.C1

	pshufd	\$0b00011011,$ABEF0,$ABEF0
	pshufd	\$0b00011011,$CDGH0,$CDGH0
	pshufd	\$0b00011011,$ABEF1,$ABEF1
	pshufd	\$0b00011011,$CDGH1,$CDGH1
	jmp	.Loop_shaext

.align	32
.Loop_shaext:
	movdqu	0x00(@ptr[0]),@MSG0[0]
	movdqu	0x00(@ptr[1]),@MSG1[0]
	movdqu	0x10(@ptr[0]),@MSG0[1]
	movdqu	0x10(@ptr[1]),@MSG1[1]
	movdqu	0x20(@ptr[0]),@MSG0[2]
	pshufb	$TMPx,@MSG0[0]
	movdqu	0x20(@ptr[1]),@MSG1[2]
	pshufb	$TMPx,@MSG1[0]
	movdqu	0x30(@ptr[0]),@MSG0[3]
	lea	0x40(@ptr[0]),@ptr[0]
	movdqu	0x30(@ptr[1]),@MSG1[3]
	lea	0x40(@ptr[1]),@ptr[1]

	movdqa	0*16-0x80($Tbl),$Wi
	pshufb	$TMPx,@MSG0[1]
	paddd	@MSG0[0],$Wi
	pxor	$ABEF0,@MSG0[0]		# black magic
	movdqa	$Wi,$TMP0
	movdqa	0*16-0x80($Tbl),$TMP1
	pshufb	$TMPx,@MSG1[1]
	paddd	@MSG1[0],$TMP1
	movdqa	$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0	# 0-3
	pxor	$ABEF1,@MSG1[0]		# black magic
	movdqa	$TMP1,$Wi
	movdqa	$CDGH1,0x70(%rsp)
	sha256rnds2	$ABEF1,$CDGH1	# 0-3
	pshufd	\$0x0e,$TMP0,$Wi
	pxor	$ABEF0,@MSG0[0]		# black magic
	movdqa	$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	pxor	$ABEF1,@MSG1[0]		# black magic
	movdqa	$ABEF1,0x60(%rsp)
	movdqa	1*16-0x80($Tbl),$TMP0
	paddd	@MSG0[1],$TMP0
	pshufb	$TMPx,@MSG0[2]
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	movdqa	1*16-0x80($Tbl),$TMP1
	paddd	@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0	# 4-7
	movdqa	$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb	$TMPx,@MSG0[3]
	pshufb	$TMPx,@MSG1[2]
	prefetcht0	127(@ptr[1])
	sha256rnds2	$ABEF1,$CDGH1	# 4-7
	pshufd	\$0x0e,$TMP0,$Wi
	pshufb	$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	2*16-0x80($Tbl),$TMP0
	paddd	@MSG0[2],$TMP0
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	movdqa	2*16-0x80($Tbl),$TMP1
	paddd	@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0	# 8-11
	sha256msg1	@MSG1[1],@MSG1[0]
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[3],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 8-11
	pshufd	\$0x0e,$TMP0,$Wi
	palignr	\$4,@MSG0[2],$TMPx
	paddd	$TMPx,@MSG0[0]
	movdqa	@MSG1[3],$TMPx
	palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	3*16-0x80($Tbl),$TMP0
	paddd	@MSG0[3],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[2],@MSG1[1]

	movdqa	$TMP0,$Wi
	movdqa	3*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[0]
	paddd	@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0	# 12-15
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[0],$TMPx
	palignr	\$4,@MSG0[3],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 12-15
	sha256msg2	@MSG1[3],@MSG1[0]
	pshufd	\$0x0e,$TMP0,$Wi
	paddd	$TMPx,@MSG0[1]
	movdqa	@MSG1[0],$TMPx
	palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	4*16-0x80($Tbl),$TMP0
	paddd	@MSG0[0],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[3],@MSG1[2]
___
# Middle rounds 16..51: identical pattern, message registers rotated
# via push/shift below so the same template serves every iteration.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa	$TMP0,$Wi
	movdqa	$i*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[1]
	paddd	@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0	# 16-19...
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[1],$TMPx
	palignr	\$4,@MSG0[0],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 16-19...
	sha256msg2	@MSG1[0],@MSG1[1]
	pshufd	\$0x0e,$TMP0,$Wi
	paddd	$TMPx,@MSG0[2]
	movdqa	@MSG1[1],$TMPx
	palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	`($i+1)*16`-0x80($Tbl),$TMP0
	paddd	@MSG0[1],$TMP0
	sha256rnds2	$CDGH1,$ABEF1
	sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
$code.=<<___;
	movdqa	$TMP0,$Wi
	movdqa	13*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[1]
	paddd	@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0	# 52-55
	movdqa	$TMP1,$Wi
	movdqa	@MSG0[1],$TMPx
	palignr	\$4,@MSG0[0],$TMPx
	sha256rnds2	$ABEF1,$CDGH1	# 52-55
	sha256msg2	@MSG1[0],@MSG1[1]
	pshufd	\$0x0e,$TMP0,$Wi
	paddd	$TMPx,@MSG0[2]
	movdqa	@MSG1[1],$TMPx
	palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	14*16-0x80($Tbl),$TMP0
	paddd	@MSG0[1],$TMP0
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	movdqa	14*16-0x80($Tbl),$TMP1
	paddd	$TMPx,@MSG1[2]
	paddd	@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0	# 56-59
	movdqa	$TMP1,$Wi
	mov	\$1,%ecx
	pxor	@MSG0[1],@MSG0[1]	# zero
	sha256rnds2	$ABEF1,$CDGH1	# 56-59
	sha256msg2	@MSG1[1],@MSG1[2]
	pshufd	\$0x0e,$TMP0,$Wi
	movdqa	15*16-0x80($Tbl),$TMP0
	paddd	@MSG0[2],$TMP0
	movq	(%rbx),@MSG0[2]		# pull counters
	nop
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	movdqa	15*16-0x80($Tbl),$TMP1
	paddd	@MSG1[2],$TMP1
	sha256rnds2	$CDGH1,$ABEF1

	movdqa	$TMP0,$Wi
	cmp	4*0(%rbx),%ecx		# examine counters
	cmovge	%rsp,@ptr[0]		# cancel input
	cmp	4*1(%rbx),%ecx
	cmovge	%rsp,@ptr[1]
	pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0	# 60-63
	movdqa	$TMP1,$Wi
	pshufd	\$0x55,@MSG0[2],@MSG1[1]
	movdqa	@MSG0[2],@MSG1[2]
	sha256rnds2	$ABEF1,$CDGH1	# 60-63
	pshufd	\$0x0e,$TMP0,$Wi
	pcmpgtd	@MSG0[1],@MSG1[0]
	pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	pshufd	\$0x0e,$TMP1,$Wi
	pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	movdqa	K256_shaext-0x10(%rip),$TMPx
	sha256rnds2	$CDGH1,$ABEF1

	# Accumulate into the offloaded state, masking out lanes whose
	# counter already reached zero.
	pand	@MSG1[0],$CDGH0
	pand	@MSG1[1],$CDGH1
	pand	@MSG1[0],$ABEF0
	pand	@MSG1[1],$ABEF1
	paddd	@MSG0[2],@MSG1[2]	# counters--

	paddd	0x50(%rsp),$CDGH0
	paddd	0x70(%rsp),$CDGH1
	paddd	0x40(%rsp),$ABEF0
	paddd	0x60(%rsp),$ABEF1

	movq	@MSG1[2],(%rbx)		# save counters
	dec	$num
	jnz	.Loop_shaext

	mov	`$REG_SZ*17+8`(%rsp),$num

	pshufd	\$0b00011011,$ABEF0,$ABEF0
	pshufd	\$0b00011011,$CDGH0,$CDGH0
	pshufd	\$0b00011011,$ABEF1,$ABEF1
	pshufd	\$0b00011011,$CDGH1,$CDGH1

	# De-interleave ABEF/CDGH back into the A..H context layout.
	movdqa	$ABEF0,@MSG0[0]
	movdqa	$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0		# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]		# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0		# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]		# H1.H0.G1.G0

	movq	$ABEF0,0x00-0x80($ctx)	# A1.A0
	psrldq	\$8,$ABEF0
	movq	@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq	\$8,@MSG0[0]
	movq	$ABEF0,0x20-0x80($ctx)	# B1.B0
	movq	@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq	$CDGH0,0x40-0x80($ctx)	# C1.C0
	psrldq	\$8,$CDGH0
	movq	@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq	\$8,@MSG0[1]
	movq	$CDGH0,0x60-0x80($ctx)	# D1.D0
	movq	@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`$inp_elm_size*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax	# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
						if ($avx) {{{
# AVX round generator; the same subs serve both the xmm ($REG_SZ==16,
# 4 lanes) and the AVX2 ymm ($REG_SZ==32, 8 lanes) instantiations —
# the gather templates below are selected by $REG_SZ.
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd	`4*$i`(@ptr[0]),$Xi
	vmovd	`4*$i`(@ptr[1]),$t1
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	`4*$i`(@ptr[1]),$t1
	lea	`16*4`(@ptr[1]),@ptr[1]
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t1,$t1
	lea	`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd	`4*$i`(@ptr[0]),$Xi
	vmovd	`4*$i`(@ptr[4]),$t1
	vmovd	`4*$i`(@ptr[1]),$t2
	vmovd	`4*$i`(@ptr[5]),$t3
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	`4*$i`(@ptr[4]),$t1
	lea	`16*4`(@ptr[4]),@ptr[4]
	vmovd	`4*$i`(@ptr[1]),$t2
	lea	`16*4`(@ptr[1]),@ptr[1]
	vmovd	`4*$i`(@ptr[5]),$t3
	lea	`16*4`(@ptr[5]),@ptr[5]
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,`4*$i`(@ptr[6]),$t1,$t1
	lea	`16*4`(@ptr[6]),@ptr[6]
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t2,$t2
	lea	`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[7]),$t3,$t3
	lea	`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	vpaddd	$h,$Xi,$Xi		# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[0])" if ($i==15)`
	vpslld	\$7,$e,$t3
	vpandn	$g,$e,$t1
	vpand	$f,$e,$axb		# borrow $axb
	`"prefetcht0	63(@ptr[1])" if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h		# borrow $h
	vpxor	$t3,$sigma,$sigma	# Sigma1(e)
	`"prefetcht0	63(@ptr[2])" if ($i==15)`
	vpslld	\$30,$a,$t2
	vpxor	$axb,$t1,$t1		# Ch(e,f,g)
	vpxor	$a,$b,$axb		# a^b, b^c in next round
	`"prefetcht0	63(@ptr[3])" if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi		# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	`"prefetcht0	63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	vpaddd	$t1,$Xi,$Xi		# Xi+=Ch(e,f,g)
	vpand	$axb,$bxc,$bxc
	`"prefetcht0	63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	vpxor	$bxc,$b,$h		# h=Maj(a,b,c)=Ch(a^b,c,b)
	vpaddd	$Xi,$d,$d		# d+=Xi
	`"prefetcht0	63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma	# Sigma0(a)

	vpaddd	$Xi,$h,$h		# h+=Xi
	vpaddd	$sigma,$h,$h		# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);	# rotate the a^b/b^c alias for next round
}

# AVX message-schedule expansion for rounds >= 16; falls through to
# the common AVX round body above.
sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb		# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma	# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	vpaddd	$sigma,$Xi,$Xi		# Xi+=sigma0(e)
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma	# sigma0(X[i+14])
	vpaddd	$sigma,$Xi,$Xi		# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);	# alternate schedule registers between rounds
}

# 4-lane AVX implementation; when AVX2 is available and there is more
# than one batch, control falls through to _avx2_shortcut instead.
$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd	$t1,$Xn,$Xn		# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	# Masked merge of new state into the context, as in the SSE path.
	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
				if ($avx>1) {
# The AVX code above still contains unexpanded `...` fragments that
# reference the current (16-byte) $REG_SZ; evaluate them now, before
# $REG_SZ is rebound to 32 for the AVX2 instantiation below.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;	# ymm registers: 8 lanes
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd	$t1,$Xn,$Xn		# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	# Masked merge of new state into the context, as in the SSE path.
	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
movaps -0x48(%rax),%xmm15 1320___ 1321$code.=<<___; 1322 mov -48(%rax),%r15 1323.cfi_restore %r15 1324 mov -40(%rax),%r14 1325.cfi_restore %r14 1326 mov -32(%rax),%r13 1327.cfi_restore %r13 1328 mov -24(%rax),%r12 1329.cfi_restore %r12 1330 mov -16(%rax),%rbp 1331.cfi_restore %rbp 1332 mov -8(%rax),%rbx 1333.cfi_restore %rbx 1334 lea (%rax),%rsp 1335.cfi_def_cfa_register %rsp 1336.Lepilogue_avx2: 1337 ret 1338.cfi_endproc 1339.size sha256_multi_block_avx2,.-sha256_multi_block_avx2 1340___ 1341 } }}} 1342$code.=<<___; 1343.align 256 1344K256: 1345___ 1346sub TABLE { 1347 foreach (@_) { 1348 $code.=<<___; 1349 .long $_,$_,$_,$_ 1350 .long $_,$_,$_,$_ 1351___ 1352 } 1353} 1354&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 1355 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 1356 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 1357 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 1358 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 1359 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 1360 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 1361 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 1362 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 1363 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 1364 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 1365 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 1366 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 1367 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 1368 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 1369 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); 1370$code.=<<___; 1371.Lpbswap: 1372 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap 1373 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap 1374K256_shaext: 1375 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 1376 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 1377 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 1378 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 1379 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 1380 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 1381 
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 1382 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 1383 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 1384 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 1385 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 1386 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 1387 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 1388 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 1389 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 1390 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 1391 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1392___ 1393 1394if ($win64) { 1395# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1396# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1397$rec="%rcx"; 1398$frame="%rdx"; 1399$context="%r8"; 1400$disp="%r9"; 1401 1402$code.=<<___; 1403.extern __imp_RtlVirtualUnwind 1404.type se_handler,\@abi-omnipotent 1405.align 16 1406se_handler: 1407 push %rsi 1408 push %rdi 1409 push %rbx 1410 push %rbp 1411 push %r12 1412 push %r13 1413 push %r14 1414 push %r15 1415 pushfq 1416 sub \$64,%rsp 1417 1418 mov 120($context),%rax # pull context->Rax 1419 mov 248($context),%rbx # pull context->Rip 1420 1421 mov 8($disp),%rsi # disp->ImageBase 1422 mov 56($disp),%r11 # disp->HandlerData 1423 1424 mov 0(%r11),%r10d # HandlerData[0] 1425 lea (%rsi,%r10),%r10 # end of prologue label 1426 cmp %r10,%rbx # context->Rip<.Lbody 1427 jb .Lin_prologue 1428 1429 mov 152($context),%rax # pull context->Rsp 1430 1431 mov 4(%r11),%r10d # HandlerData[1] 1432 lea (%rsi,%r10),%r10 # epilogue label 1433 cmp %r10,%rbx # context->Rip>=.Lepilogue 1434 jae .Lin_prologue 1435 1436 mov `16*17`(%rax),%rax # pull saved stack pointer 1437 1438 mov -8(%rax),%rbx 1439 mov -16(%rax),%rbp 1440 mov %rbx,144($context) # restore context->Rbx 1441 mov %rbp,160($context) # restore context->Rbp 1442 1443 lea -24-10*16(%rax),%rsi 1444 lea 512($context),%rdi # &context.Xmm6 1445 mov 
\$20,%ecx 1446 .long 0xa548f3fc # cld; rep movsq 1447 1448.Lin_prologue: 1449 mov 8(%rax),%rdi 1450 mov 16(%rax),%rsi 1451 mov %rax,152($context) # restore context->Rsp 1452 mov %rsi,168($context) # restore context->Rsi 1453 mov %rdi,176($context) # restore context->Rdi 1454 1455 mov 40($disp),%rdi # disp->ContextRecord 1456 mov $context,%rsi # context 1457 mov \$154,%ecx # sizeof(CONTEXT) 1458 .long 0xa548f3fc # cld; rep movsq 1459 1460 mov $disp,%rsi 1461 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1462 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1463 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1464 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1465 mov 40(%rsi),%r10 # disp->ContextRecord 1466 lea 56(%rsi),%r11 # &disp->HandlerData 1467 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1468 mov %r10,32(%rsp) # arg5 1469 mov %r11,40(%rsp) # arg6 1470 mov %r12,48(%rsp) # arg7 1471 mov %rcx,56(%rsp) # arg8, (NULL) 1472 call *__imp_RtlVirtualUnwind(%rip) 1473 1474 mov \$1,%eax # ExceptionContinueSearch 1475 add \$64,%rsp 1476 popfq 1477 pop %r15 1478 pop %r14 1479 pop %r13 1480 pop %r12 1481 pop %rbp 1482 pop %rbx 1483 pop %rdi 1484 pop %rsi 1485 ret 1486.size se_handler,.-se_handler 1487___ 1488$code.=<<___ if ($avx>1); 1489.type avx2_handler,\@abi-omnipotent 1490.align 16 1491avx2_handler: 1492 push %rsi 1493 push %rdi 1494 push %rbx 1495 push %rbp 1496 push %r12 1497 push %r13 1498 push %r14 1499 push %r15 1500 pushfq 1501 sub \$64,%rsp 1502 1503 mov 120($context),%rax # pull context->Rax 1504 mov 248($context),%rbx # pull context->Rip 1505 1506 mov 8($disp),%rsi # disp->ImageBase 1507 mov 56($disp),%r11 # disp->HandlerData 1508 1509 mov 0(%r11),%r10d # HandlerData[0] 1510 lea (%rsi,%r10),%r10 # end of prologue label 1511 cmp %r10,%rbx # context->Rip<body label 1512 jb .Lin_prologue 1513 1514 mov 152($context),%rax # pull context->Rsp 1515 1516 mov 4(%r11),%r10d # HandlerData[1] 1517 lea (%rsi,%r10),%r10 # epilogue label 1518 cmp %r10,%rbx # context->Rip>=epilogue label 1519 jae 
.Lin_prologue 1520 1521 mov `32*17`($context),%rax # pull saved stack pointer 1522 1523 mov -8(%rax),%rbx 1524 mov -16(%rax),%rbp 1525 mov -24(%rax),%r12 1526 mov -32(%rax),%r13 1527 mov -40(%rax),%r14 1528 mov -48(%rax),%r15 1529 mov %rbx,144($context) # restore context->Rbx 1530 mov %rbp,160($context) # restore context->Rbp 1531 mov %r12,216($context) # restore context->R12 1532 mov %r13,224($context) # restore context->R13 1533 mov %r14,232($context) # restore context->R14 1534 mov %r15,240($context) # restore context->R15 1535 1536 lea -56-10*16(%rax),%rsi 1537 lea 512($context),%rdi # &context.Xmm6 1538 mov \$20,%ecx 1539 .long 0xa548f3fc # cld; rep movsq 1540 1541 jmp .Lin_prologue 1542.size avx2_handler,.-avx2_handler 1543___ 1544$code.=<<___; 1545.section .pdata 1546.align 4 1547 .rva .LSEH_begin_sha256_multi_block 1548 .rva .LSEH_end_sha256_multi_block 1549 .rva .LSEH_info_sha256_multi_block 1550 .rva .LSEH_begin_sha256_multi_block_shaext 1551 .rva .LSEH_end_sha256_multi_block_shaext 1552 .rva .LSEH_info_sha256_multi_block_shaext 1553___ 1554$code.=<<___ if ($avx); 1555 .rva .LSEH_begin_sha256_multi_block_avx 1556 .rva .LSEH_end_sha256_multi_block_avx 1557 .rva .LSEH_info_sha256_multi_block_avx 1558___ 1559$code.=<<___ if ($avx>1); 1560 .rva .LSEH_begin_sha256_multi_block_avx2 1561 .rva .LSEH_end_sha256_multi_block_avx2 1562 .rva .LSEH_info_sha256_multi_block_avx2 1563___ 1564$code.=<<___; 1565.section .xdata 1566.align 8 1567.LSEH_info_sha256_multi_block: 1568 .byte 9,0,0,0 1569 .rva se_handler 1570 .rva .Lbody,.Lepilogue # HandlerData[] 1571.LSEH_info_sha256_multi_block_shaext: 1572 .byte 9,0,0,0 1573 .rva se_handler 1574 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] 1575___ 1576$code.=<<___ if ($avx); 1577.LSEH_info_sha256_multi_block_avx: 1578 .byte 9,0,0,0 1579 .rva se_handler 1580 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] 1581___ 1582$code.=<<___ if ($avx>1); 1583.LSEH_info_sha256_multi_block_avx2: 1584 .byte 9,0,0,0 1585 .rva 
avx2_handler 1586 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] 1587___ 1588} 1589#################################################################### 1590 1591sub rex { 1592 local *opcode=shift; 1593 my ($dst,$src)=@_; 1594 my $rex=0; 1595 1596 $rex|=0x04 if ($dst>=8); 1597 $rex|=0x01 if ($src>=8); 1598 unshift @opcode,$rex|0x40 if ($rex); 1599} 1600 1601sub sha256op38 { 1602 my $instr = shift; 1603 my %opcodelet = ( 1604 "sha256rnds2" => 0xcb, 1605 "sha256msg1" => 0xcc, 1606 "sha256msg2" => 0xcd ); 1607 1608 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { 1609 my @opcode=(0x0f,0x38); 1610 rex(\@opcode,$2,$1); 1611 push @opcode,$opcodelet{$instr}; 1612 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 1613 return ".byte\t".join(',',@opcode); 1614 } else { 1615 return $instr."\t".@_[0]; 1616 } 1617} 1618 1619foreach (split("\n",$code)) { 1620 s/\`([^\`]*)\`/eval($1)/ge; 1621 1622 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or 1623 1624 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1625 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1626 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or 1627 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1628 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or 1629 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 1630 1631 print $_,"\n"; 1632} 1633 1634close STDOUT or die "error closing STDOUT: $!"; 1635