#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes shared
# table]. The GHASH function features a so-called "528B" variant
# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
# shared table]. Performance results are for this streamed GHASH
# subroutine and are expressed in cycles per processed byte; less is
# better:
#
#               gcc 3.4.x(*)    assembler
#
# P4            28.6            14.0            +100%
# Opteron       19.3            7.7             +150%
# Core2         17.8            8.1(**)         +120%
# Atom          31.6            16.8            +88%
# VIA Nano      21.8            10.1            +115%
#
# (*)   comparison is not completely fair, because C results are
#       for vanilla "256B" implementation, while assembler results
#       are for "528B";-)
# (**)  it's a mystery [to me] why the Core2 result is not the same
#       as for Opteron.

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter, ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay
# down aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# less sense to aggregate more multiplications than it takes to
# perform the remaining non-multiplication operations. 2x is a
# near-optimal coefficient for contemporary Intel CPUs (hence the
# modest improvement coefficient), but not for Bulldozer. The latter
# is because logical SIMD operations are twice as slow in comparison
# to Intel, so that the critical path is longer. A CPU with a higher
# pclmulqdq issue rate would also benefit from a higher aggregate
# factor...
#
# Westmere      1.78(+13%)
# Sandy Bridge  1.80(+8%)
# Ivy Bridge    1.80(+7%)
# Haswell       0.55(+93%) (if system doesn't support AVX)
# Broadwell     0.45(+110%)(if system doesn't support AVX)
# Skylake       0.44(+110%)(if system doesn't support AVX)
# Bulldozer     1.49(+27%)
# Silvermont    2.88(+13%)
# Knights L     2.12(-)    (if system doesn't support AVX)
# Goldmont      1.08(+24%)

# March 2013
#
# ... 8x aggregate factor AVX code path is using the reduction
# algorithm suggested by Shay Gueron[1]. Even though contemporary
# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the
# code performs sub-optimally in comparison to the above-mentioned
# version. But thanks to Ilya Albrekht and Max Locktyukhin of Intel
# Corp. we know that it performs in 0.41 cycles per byte on a Haswell
# processor, in 0.29 on Broadwell, and in 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
90# 91# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest 92 93$flavour = shift; 94$output = shift; 95if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 96 97$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 98 99$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 100( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 101( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 102die "can't locate x86_64-xlate.pl"; 103 104if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 105 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 106 $avx = ($1>=2.20) + ($1>=2.22); 107} 108 109if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 110 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 111 $avx = ($1>=2.09) + ($1>=2.10); 112} 113 114if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 115 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 116 $avx = ($1>=10) + ($1>=11); 117} 118 119if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { 120 $avx = ($2>=3.0) + ($2>3.0); 121} 122 123open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 124*STDOUT=*OUT; 125 126$do4xaggr=1; 127 128# common register layout 129$nlo="%rax"; 130$nhi="%rbx"; 131$Zlo="%r8"; 132$Zhi="%r9"; 133$tmp="%r10"; 134$rem_4bit = "%r11"; 135 136$Xi="%rdi"; 137$Htbl="%rsi"; 138 139# per-function register layout 140$cnt="%rcx"; 141$rem="%rdx"; 142 143sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or 144 $r =~ s/%[er]([sd]i)/%\1l/ or 145 $r =~ s/%[er](bp)/%\1l/ or 146 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } 147 148sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 149{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 150 my $arg = pop; 151 $arg = "\$$arg" if ($arg*1 eq $arg); 152 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 153} 154 155{ my $N; 156 sub loop() { 157 my $inp = shift; 158 159 $N++; 160$code.=<<___; 161 xor $nlo,$nlo 162 xor $nhi,$nhi 163 mov 
`&LB("$Zlo")`,`&LB("$nlo")` 164 mov `&LB("$Zlo")`,`&LB("$nhi")` 165 shl \$4,`&LB("$nlo")` 166 mov \$14,$cnt 167 mov 8($Htbl,$nlo),$Zlo 168 mov ($Htbl,$nlo),$Zhi 169 and \$0xf0,`&LB("$nhi")` 170 mov $Zlo,$rem 171 jmp .Loop$N 172 173.align 16 174.Loop$N: 175 shr \$4,$Zlo 176 and \$0xf,$rem 177 mov $Zhi,$tmp 178 mov ($inp,$cnt),`&LB("$nlo")` 179 shr \$4,$Zhi 180 xor 8($Htbl,$nhi),$Zlo 181 shl \$60,$tmp 182 xor ($Htbl,$nhi),$Zhi 183 mov `&LB("$nlo")`,`&LB("$nhi")` 184 xor ($rem_4bit,$rem,8),$Zhi 185 mov $Zlo,$rem 186 shl \$4,`&LB("$nlo")` 187 xor $tmp,$Zlo 188 dec $cnt 189 js .Lbreak$N 190 191 shr \$4,$Zlo 192 and \$0xf,$rem 193 mov $Zhi,$tmp 194 shr \$4,$Zhi 195 xor 8($Htbl,$nlo),$Zlo 196 shl \$60,$tmp 197 xor ($Htbl,$nlo),$Zhi 198 and \$0xf0,`&LB("$nhi")` 199 xor ($rem_4bit,$rem,8),$Zhi 200 mov $Zlo,$rem 201 xor $tmp,$Zlo 202 jmp .Loop$N 203 204.align 16 205.Lbreak$N: 206 shr \$4,$Zlo 207 and \$0xf,$rem 208 mov $Zhi,$tmp 209 shr \$4,$Zhi 210 xor 8($Htbl,$nlo),$Zlo 211 shl \$60,$tmp 212 xor ($Htbl,$nlo),$Zhi 213 and \$0xf0,`&LB("$nhi")` 214 xor ($rem_4bit,$rem,8),$Zhi 215 mov $Zlo,$rem 216 xor $tmp,$Zlo 217 218 shr \$4,$Zlo 219 and \$0xf,$rem 220 mov $Zhi,$tmp 221 shr \$4,$Zhi 222 xor 8($Htbl,$nhi),$Zlo 223 shl \$60,$tmp 224 xor ($Htbl,$nhi),$Zhi 225 xor $tmp,$Zlo 226 xor ($rem_4bit,$rem,8),$Zhi 227 228 bswap $Zlo 229 bswap $Zhi 230___ 231}} 232 233$code=<<___; 234.text 235.extern OPENSSL_ia32cap_P 236 237.globl gcm_gmult_4bit 238.type gcm_gmult_4bit,\@function,2 239.align 16 240gcm_gmult_4bit: 241.cfi_startproc 242 push %rbx 243.cfi_push %rbx 244 push %rbp # %rbp and others are pushed exclusively in 245.cfi_push %rbp 246 push %r12 # order to reuse Win64 exception handler... 
247.cfi_push %r12 248 push %r13 249.cfi_push %r13 250 push %r14 251.cfi_push %r14 252 push %r15 253.cfi_push %r15 254 sub \$280,%rsp 255.cfi_adjust_cfa_offset 280 256.Lgmult_prologue: 257 258 movzb 15($Xi),$Zlo 259 lea .Lrem_4bit(%rip),$rem_4bit 260___ 261 &loop ($Xi); 262$code.=<<___; 263 mov $Zlo,8($Xi) 264 mov $Zhi,($Xi) 265 266 lea 280+48(%rsp),%rsi 267.cfi_def_cfa %rsi,8 268 mov -8(%rsi),%rbx 269.cfi_restore %rbx 270 lea (%rsi),%rsp 271.cfi_def_cfa_register %rsp 272.Lgmult_epilogue: 273 ret 274.cfi_endproc 275.size gcm_gmult_4bit,.-gcm_gmult_4bit 276___ 277 278# per-function register layout 279$inp="%rdx"; 280$len="%rcx"; 281$rem_8bit=$rem_4bit; 282 283$code.=<<___; 284.globl gcm_ghash_4bit 285.type gcm_ghash_4bit,\@function,4 286.align 16 287gcm_ghash_4bit: 288.cfi_startproc 289 push %rbx 290.cfi_push %rbx 291 push %rbp 292.cfi_push %rbp 293 push %r12 294.cfi_push %r12 295 push %r13 296.cfi_push %r13 297 push %r14 298.cfi_push %r14 299 push %r15 300.cfi_push %r15 301 sub \$280,%rsp 302.cfi_adjust_cfa_offset 280 303.Lghash_prologue: 304 mov $inp,%r14 # reassign couple of args 305 mov $len,%r15 306___ 307{ my $inp="%r14"; 308 my $dat="%edx"; 309 my $len="%r15"; 310 my @nhi=("%ebx","%ecx"); 311 my @rem=("%r12","%r13"); 312 my $Hshr4="%rbp"; 313 314 &sub ($Htbl,-128); # size optimization 315 &lea ($Hshr4,"16+128(%rsp)"); 316 { my @lo =($nlo,$nhi); 317 my @hi =($Zlo,$Zhi); 318 319 &xor ($dat,$dat); 320 for ($i=0,$j=-2;$i<18;$i++,$j++) { 321 &mov ("$j(%rsp)",&LB($dat)) if ($i>1); 322 &or ($lo[0],$tmp) if ($i>1); 323 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); 324 &shr ($lo[1],4) if ($i>0 && $i<17); 325 &mov ($tmp,$hi[1]) if ($i>0 && $i<17); 326 &shr ($hi[1],4) if ($i>0 && $i<17); 327 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); 328 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); 329 &shl (&LB($dat),4) if ($i>0 && $i<17); 330 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); 331 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); 332 &shl ($tmp,60) if ($i>0 && $i<17); 333 334 
push (@lo,shift(@lo)); 335 push (@hi,shift(@hi)); 336 } 337 } 338 &add ($Htbl,-128); 339 &mov ($Zlo,"8($Xi)"); 340 &mov ($Zhi,"0($Xi)"); 341 &add ($len,$inp); # pointer to the end of data 342 &lea ($rem_8bit,".Lrem_8bit(%rip)"); 343 &jmp (".Louter_loop"); 344 345$code.=".align 16\n.Louter_loop:\n"; 346 &xor ($Zhi,"($inp)"); 347 &mov ("%rdx","8($inp)"); 348 &lea ($inp,"16($inp)"); 349 &xor ("%rdx",$Zlo); 350 &mov ("($Xi)",$Zhi); 351 &mov ("8($Xi)","%rdx"); 352 &shr ("%rdx",32); 353 354 &xor ($nlo,$nlo); 355 &rol ($dat,8); 356 &mov (&LB($nlo),&LB($dat)); 357 &movz ($nhi[0],&LB($dat)); 358 &shl (&LB($nlo),4); 359 &shr ($nhi[0],4); 360 361 for ($j=11,$i=0;$i<15;$i++) { 362 &rol ($dat,8); 363 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); 364 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); 365 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); 366 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); 367 368 &mov (&LB($nlo),&LB($dat)); 369 &xor ($Zlo,$tmp) if ($i>0); 370 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); 371 372 &movz ($nhi[1],&LB($dat)); 373 &shl (&LB($nlo),4); 374 &movzb ($rem[0],"(%rsp,$nhi[0])"); 375 376 &shr ($nhi[1],4) if ($i<14); 377 &and ($nhi[1],0xf0) if ($i==14); 378 &shl ($rem[1],48) if ($i>0); 379 &xor ($rem[0],$Zlo); 380 381 &mov ($tmp,$Zhi); 382 &xor ($Zhi,$rem[1]) if ($i>0); 383 &shr ($Zlo,8); 384 385 &movz ($rem[0],&LB($rem[0])); 386 &mov ($dat,"$j($Xi)") if (--$j%4==0); 387 &shr ($Zhi,8); 388 389 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); 390 &shl ($tmp,56); 391 &xor ($Zhi,"($Hshr4,$nhi[0],8)"); 392 393 unshift (@nhi,pop(@nhi)); # "rotate" registers 394 unshift (@rem,pop(@rem)); 395 } 396 &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); 397 &xor ($Zlo,"8($Htbl,$nlo)"); 398 &xor ($Zhi,"($Htbl,$nlo)"); 399 400 &shl ($rem[1],48); 401 &xor ($Zlo,$tmp); 402 403 &xor ($Zhi,$rem[1]); 404 &movz ($rem[0],&LB($Zlo)); 405 &shr ($Zlo,4); 406 407 &mov ($tmp,$Zhi); 408 &shl (&LB($rem[0]),4); 409 &shr ($Zhi,4); 410 411 &xor ($Zlo,"8($Htbl,$nhi[0])"); 412 &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); 413 &shl 
($tmp,60); 414 415 &xor ($Zhi,"($Htbl,$nhi[0])"); 416 &xor ($Zlo,$tmp); 417 &shl ($rem[0],48); 418 419 &bswap ($Zlo); 420 &xor ($Zhi,$rem[0]); 421 422 &bswap ($Zhi); 423 &cmp ($inp,$len); 424 &jb (".Louter_loop"); 425} 426$code.=<<___; 427 mov $Zlo,8($Xi) 428 mov $Zhi,($Xi) 429 430 lea 280+48(%rsp),%rsi 431.cfi_def_cfa %rsi,8 432 mov -48(%rsi),%r15 433.cfi_restore %r15 434 mov -40(%rsi),%r14 435.cfi_restore %r14 436 mov -32(%rsi),%r13 437.cfi_restore %r13 438 mov -24(%rsi),%r12 439.cfi_restore %r12 440 mov -16(%rsi),%rbp 441.cfi_restore %rbp 442 mov -8(%rsi),%rbx 443.cfi_restore %rbx 444 lea 0(%rsi),%rsp 445.cfi_def_cfa_register %rsp 446.Lghash_epilogue: 447 ret 448.cfi_endproc 449.size gcm_ghash_4bit,.-gcm_ghash_4bit 450___ 451 452###################################################################### 453# PCLMULQDQ version. 454 455@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 456 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 457 458($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; 459($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); 460 461sub clmul64x64_T2 { # minimal register pressure 462my ($Xhi,$Xi,$Hkey,$HK)=@_; 463 464if (!defined($HK)) { $HK = $T2; 465$code.=<<___; 466 movdqa $Xi,$Xhi # 467 pshufd \$0b01001110,$Xi,$T1 468 pshufd \$0b01001110,$Hkey,$T2 469 pxor $Xi,$T1 # 470 pxor $Hkey,$T2 471___ 472} else { 473$code.=<<___; 474 movdqa $Xi,$Xhi # 475 pshufd \$0b01001110,$Xi,$T1 476 pxor $Xi,$T1 # 477___ 478} 479$code.=<<___; 480 pclmulqdq \$0x00,$Hkey,$Xi ####### 481 pclmulqdq \$0x11,$Hkey,$Xhi ####### 482 pclmulqdq \$0x00,$HK,$T1 ####### 483 pxor $Xi,$T1 # 484 pxor $Xhi,$T1 # 485 486 movdqa $T1,$T2 # 487 psrldq \$8,$T1 488 pslldq \$8,$T2 # 489 pxor $T1,$Xhi 490 pxor $T2,$Xi # 491___ 492} 493 494sub reduction_alg9 { # 17/11 times faster than Intel version 495my ($Xhi,$Xi) = @_; 496 497$code.=<<___; 498 # 1st phase 499 movdqa $Xi,$T2 # 500 movdqa $Xi,$T1 501 psllq \$5,$Xi 502 pxor $Xi,$T1 # 503 psllq \$1,$Xi 504 pxor $T1,$Xi # 505 psllq \$57,$Xi # 506 movdqa 
$Xi,$T1 # 507 pslldq \$8,$Xi 508 psrldq \$8,$T1 # 509 pxor $T2,$Xi 510 pxor $T1,$Xhi # 511 512 # 2nd phase 513 movdqa $Xi,$T2 514 psrlq \$1,$Xi 515 pxor $T2,$Xhi # 516 pxor $Xi,$T2 517 psrlq \$5,$Xi 518 pxor $T2,$Xi # 519 psrlq \$1,$Xi # 520 pxor $Xhi,$Xi # 521___ 522} 523 524{ my ($Htbl,$Xip)=@_4args; 525 my $HK="%xmm6"; 526 527$code.=<<___; 528.globl gcm_init_clmul 529.type gcm_init_clmul,\@abi-omnipotent 530.align 16 531gcm_init_clmul: 532.cfi_startproc 533.L_init_clmul: 534___ 535$code.=<<___ if ($win64); 536.LSEH_begin_gcm_init_clmul: 537 # I can't trust assembler to use specific encoding:-( 538 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp 539 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 540___ 541$code.=<<___; 542 movdqu ($Xip),$Hkey 543 pshufd \$0b01001110,$Hkey,$Hkey # dword swap 544 545 # <<1 twist 546 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword 547 movdqa $Hkey,$T1 548 psllq \$1,$Hkey 549 pxor $T3,$T3 # 550 psrlq \$63,$T1 551 pcmpgtd $T2,$T3 # broadcast carry bit 552 pslldq \$8,$T1 553 por $T1,$Hkey # H<<=1 554 555 # magic reduction 556 pand .L0x1c2_polynomial(%rip),$T3 557 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial 558 559 # calculate H^2 560 pshufd \$0b01001110,$Hkey,$HK 561 movdqa $Hkey,$Xi 562 pxor $Hkey,$HK 563___ 564 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); 565 &reduction_alg9 ($Xhi,$Xi); 566$code.=<<___; 567 pshufd \$0b01001110,$Hkey,$T1 568 pshufd \$0b01001110,$Xi,$T2 569 pxor $Hkey,$T1 # Karatsuba pre-processing 570 movdqu $Hkey,0x00($Htbl) # save H 571 pxor $Xi,$T2 # Karatsuba pre-processing 572 movdqu $Xi,0x10($Htbl) # save H^2 573 palignr \$8,$T1,$T2 # low part is H.lo^H.hi... 
574 movdqu $T2,0x20($Htbl) # save Karatsuba "salt" 575___ 576if ($do4xaggr) { 577 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 578 &reduction_alg9 ($Xhi,$Xi); 579$code.=<<___; 580 movdqa $Xi,$T3 581___ 582 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 583 &reduction_alg9 ($Xhi,$Xi); 584$code.=<<___; 585 pshufd \$0b01001110,$T3,$T1 586 pshufd \$0b01001110,$Xi,$T2 587 pxor $T3,$T1 # Karatsuba pre-processing 588 movdqu $T3,0x30($Htbl) # save H^3 589 pxor $Xi,$T2 # Karatsuba pre-processing 590 movdqu $Xi,0x40($Htbl) # save H^4 591 palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... 592 movdqu $T2,0x50($Htbl) # save Karatsuba "salt" 593___ 594} 595$code.=<<___ if ($win64); 596 movaps (%rsp),%xmm6 597 lea 0x18(%rsp),%rsp 598.LSEH_end_gcm_init_clmul: 599___ 600$code.=<<___; 601 ret 602.cfi_endproc 603.size gcm_init_clmul,.-gcm_init_clmul 604___ 605} 606 607{ my ($Xip,$Htbl)=@_4args; 608 609$code.=<<___; 610.globl gcm_gmult_clmul 611.type gcm_gmult_clmul,\@abi-omnipotent 612.align 16 613gcm_gmult_clmul: 614.cfi_startproc 615.L_gmult_clmul: 616 movdqu ($Xip),$Xi 617 movdqa .Lbswap_mask(%rip),$T3 618 movdqu ($Htbl),$Hkey 619 movdqu 0x20($Htbl),$T2 620 pshufb $T3,$Xi 621___ 622 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); 623$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); 624 # experimental alternative. special thing about is that there 625 # no dependency between the two multiplications... 
626 mov \$`0xE1<<1`,%eax 627 mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff 628 mov \$0x07,%r11d 629 movq %rax,$T1 630 movq %r10,$T2 631 movq %r11,$T3 # borrow $T3 632 pand $Xi,$T3 633 pshufb $T3,$T2 # ($Xi&7)·0xE0 634 movq %rax,$T3 635 pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) 636 pxor $Xi,$T2 637 pslldq \$15,$T2 638 paddd $T2,$T2 # <<(64+56+1) 639 pxor $T2,$Xi 640 pclmulqdq \$0x01,$T3,$Xi 641 movdqa .Lbswap_mask(%rip),$T3 # reload $T3 642 psrldq \$1,$T1 643 pxor $T1,$Xhi 644 pslldq \$7,$Xi 645 pxor $Xhi,$Xi 646___ 647$code.=<<___; 648 pshufb $T3,$Xi 649 movdqu $Xi,($Xip) 650 ret 651.cfi_endproc 652.size gcm_gmult_clmul,.-gcm_gmult_clmul 653___ 654} 655 656{ my ($Xip,$Htbl,$inp,$len)=@_4args; 657 my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); 658 my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); 659 660$code.=<<___; 661.globl gcm_ghash_clmul 662.type gcm_ghash_clmul,\@abi-omnipotent 663.align 32 664gcm_ghash_clmul: 665.cfi_startproc 666.L_ghash_clmul: 667___ 668$code.=<<___ if ($win64); 669 lea -0x88(%rsp),%rax 670.LSEH_begin_gcm_ghash_clmul: 671 # I can't trust assembler to use specific encoding:-( 672 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp 673 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) 674 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) 675 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) 676 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) 677 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) 678 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) 679 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) 680 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) 681 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) 682 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) 683___ 684$code.=<<___; 685 movdqa .Lbswap_mask(%rip),$T3 686 687 movdqu ($Xip),$Xi 688 movdqu ($Htbl),$Hkey 689 movdqu 0x20($Htbl),$HK 690 pshufb $T3,$Xi 691 692 sub \$0x10,$len 693 jz .Lodd_tail 694 695 movdqu 0x10($Htbl),$Hkey2 696___ 
697if ($do4xaggr) { 698my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); 699 700$code.=<<___; 701 mov OPENSSL_ia32cap_P+4(%rip),%eax 702 cmp \$0x30,$len 703 jb .Lskip4x 704 705 and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE 706 cmp \$`1<<22`,%eax # check for MOVBE without XSAVE 707 je .Lskip4x 708 709 sub \$0x30,$len 710 mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff 711 movdqu 0x30($Htbl),$Hkey3 712 movdqu 0x40($Htbl),$Hkey4 713 714 ####### 715 # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P 716 # 717 movdqu 0x30($inp),$Xln 718 movdqu 0x20($inp),$Xl 719 pshufb $T3,$Xln 720 pshufb $T3,$Xl 721 movdqa $Xln,$Xhn 722 pshufd \$0b01001110,$Xln,$Xmn 723 pxor $Xln,$Xmn 724 pclmulqdq \$0x00,$Hkey,$Xln 725 pclmulqdq \$0x11,$Hkey,$Xhn 726 pclmulqdq \$0x00,$HK,$Xmn 727 728 movdqa $Xl,$Xh 729 pshufd \$0b01001110,$Xl,$Xm 730 pxor $Xl,$Xm 731 pclmulqdq \$0x00,$Hkey2,$Xl 732 pclmulqdq \$0x11,$Hkey2,$Xh 733 pclmulqdq \$0x10,$HK,$Xm 734 xorps $Xl,$Xln 735 xorps $Xh,$Xhn 736 movups 0x50($Htbl),$HK 737 xorps $Xm,$Xmn 738 739 movdqu 0x10($inp),$Xl 740 movdqu 0($inp),$T1 741 pshufb $T3,$Xl 742 pshufb $T3,$T1 743 movdqa $Xl,$Xh 744 pshufd \$0b01001110,$Xl,$Xm 745 pxor $T1,$Xi 746 pxor $Xl,$Xm 747 pclmulqdq \$0x00,$Hkey3,$Xl 748 movdqa $Xi,$Xhi 749 pshufd \$0b01001110,$Xi,$T1 750 pxor $Xi,$T1 751 pclmulqdq \$0x11,$Hkey3,$Xh 752 pclmulqdq \$0x00,$HK,$Xm 753 xorps $Xl,$Xln 754 xorps $Xh,$Xhn 755 756 lea 0x40($inp),$inp 757 sub \$0x40,$len 758 jc .Ltail4x 759 760 jmp .Lmod4_loop 761.align 32 762.Lmod4_loop: 763 pclmulqdq \$0x00,$Hkey4,$Xi 764 xorps $Xm,$Xmn 765 movdqu 0x30($inp),$Xl 766 pshufb $T3,$Xl 767 pclmulqdq \$0x11,$Hkey4,$Xhi 768 xorps $Xln,$Xi 769 movdqu 0x20($inp),$Xln 770 movdqa $Xl,$Xh 771 pclmulqdq \$0x10,$HK,$T1 772 pshufd \$0b01001110,$Xl,$Xm 773 xorps $Xhn,$Xhi 774 pxor $Xl,$Xm 775 pshufb $T3,$Xln 776 movups 0x20($Htbl),$HK 777 xorps $Xmn,$T1 778 pclmulqdq \$0x00,$Hkey,$Xl 779 pshufd \$0b01001110,$Xln,$Xmn 780 781 pxor $Xi,$T1 # aggregated Karatsuba 
post-processing 782 movdqa $Xln,$Xhn 783 pxor $Xhi,$T1 # 784 pxor $Xln,$Xmn 785 movdqa $T1,$T2 # 786 pclmulqdq \$0x11,$Hkey,$Xh 787 pslldq \$8,$T1 788 psrldq \$8,$T2 # 789 pxor $T1,$Xi 790 movdqa .L7_mask(%rip),$T1 791 pxor $T2,$Xhi # 792 movq %rax,$T2 793 794 pand $Xi,$T1 # 1st phase 795 pshufb $T1,$T2 # 796 pxor $Xi,$T2 # 797 pclmulqdq \$0x00,$HK,$Xm 798 psllq \$57,$T2 # 799 movdqa $T2,$T1 # 800 pslldq \$8,$T2 801 pclmulqdq \$0x00,$Hkey2,$Xln 802 psrldq \$8,$T1 # 803 pxor $T2,$Xi 804 pxor $T1,$Xhi # 805 movdqu 0($inp),$T1 806 807 movdqa $Xi,$T2 # 2nd phase 808 psrlq \$1,$Xi 809 pclmulqdq \$0x11,$Hkey2,$Xhn 810 xorps $Xl,$Xln 811 movdqu 0x10($inp),$Xl 812 pshufb $T3,$Xl 813 pclmulqdq \$0x10,$HK,$Xmn 814 xorps $Xh,$Xhn 815 movups 0x50($Htbl),$HK 816 pshufb $T3,$T1 817 pxor $T2,$Xhi # 818 pxor $Xi,$T2 819 psrlq \$5,$Xi 820 821 movdqa $Xl,$Xh 822 pxor $Xm,$Xmn 823 pshufd \$0b01001110,$Xl,$Xm 824 pxor $T2,$Xi # 825 pxor $T1,$Xhi 826 pxor $Xl,$Xm 827 pclmulqdq \$0x00,$Hkey3,$Xl 828 psrlq \$1,$Xi # 829 pxor $Xhi,$Xi # 830 movdqa $Xi,$Xhi 831 pclmulqdq \$0x11,$Hkey3,$Xh 832 xorps $Xl,$Xln 833 pshufd \$0b01001110,$Xi,$T1 834 pxor $Xi,$T1 835 836 pclmulqdq \$0x00,$HK,$Xm 837 xorps $Xh,$Xhn 838 839 lea 0x40($inp),$inp 840 sub \$0x40,$len 841 jnc .Lmod4_loop 842 843.Ltail4x: 844 pclmulqdq \$0x00,$Hkey4,$Xi 845 pclmulqdq \$0x11,$Hkey4,$Xhi 846 pclmulqdq \$0x10,$HK,$T1 847 xorps $Xm,$Xmn 848 xorps $Xln,$Xi 849 xorps $Xhn,$Xhi 850 pxor $Xi,$Xhi # aggregated Karatsuba post-processing 851 pxor $Xmn,$T1 852 853 pxor $Xhi,$T1 # 854 pxor $Xi,$Xhi 855 856 movdqa $T1,$T2 # 857 psrldq \$8,$T1 858 pslldq \$8,$T2 # 859 pxor $T1,$Xhi 860 pxor $T2,$Xi # 861___ 862 &reduction_alg9($Xhi,$Xi); 863$code.=<<___; 864 add \$0x40,$len 865 jz .Ldone 866 movdqu 0x20($Htbl),$HK 867 sub \$0x10,$len 868 jz .Lodd_tail 869.Lskip4x: 870___ 871} 872$code.=<<___; 873 ####### 874 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = 875 # [(H*Ii+1) + (H*Xi+1)] mod P = 876 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P 877 # 878 movdqu 
($inp),$T1 # Ii 879 movdqu 16($inp),$Xln # Ii+1 880 pshufb $T3,$T1 881 pshufb $T3,$Xln 882 pxor $T1,$Xi # Ii+Xi 883 884 movdqa $Xln,$Xhn 885 pshufd \$0b01001110,$Xln,$Xmn 886 pxor $Xln,$Xmn 887 pclmulqdq \$0x00,$Hkey,$Xln 888 pclmulqdq \$0x11,$Hkey,$Xhn 889 pclmulqdq \$0x00,$HK,$Xmn 890 891 lea 32($inp),$inp # i+=2 892 nop 893 sub \$0x20,$len 894 jbe .Leven_tail 895 nop 896 jmp .Lmod_loop 897 898.align 32 899.Lmod_loop: 900 movdqa $Xi,$Xhi 901 movdqa $Xmn,$T1 902 pshufd \$0b01001110,$Xi,$Xmn # 903 pxor $Xi,$Xmn # 904 905 pclmulqdq \$0x00,$Hkey2,$Xi 906 pclmulqdq \$0x11,$Hkey2,$Xhi 907 pclmulqdq \$0x10,$HK,$Xmn 908 909 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) 910 pxor $Xhn,$Xhi 911 movdqu ($inp),$T2 # Ii 912 pxor $Xi,$T1 # aggregated Karatsuba post-processing 913 pshufb $T3,$T2 914 movdqu 16($inp),$Xln # Ii+1 915 916 pxor $Xhi,$T1 917 pxor $T2,$Xhi # "Ii+Xi", consume early 918 pxor $T1,$Xmn 919 pshufb $T3,$Xln 920 movdqa $Xmn,$T1 # 921 psrldq \$8,$T1 922 pslldq \$8,$Xmn # 923 pxor $T1,$Xhi 924 pxor $Xmn,$Xi # 925 926 movdqa $Xln,$Xhn # 927 928 movdqa $Xi,$T2 # 1st phase 929 movdqa $Xi,$T1 930 psllq \$5,$Xi 931 pxor $Xi,$T1 # 932 pclmulqdq \$0x00,$Hkey,$Xln ####### 933 psllq \$1,$Xi 934 pxor $T1,$Xi # 935 psllq \$57,$Xi # 936 movdqa $Xi,$T1 # 937 pslldq \$8,$Xi 938 psrldq \$8,$T1 # 939 pxor $T2,$Xi 940 pshufd \$0b01001110,$Xhn,$Xmn 941 pxor $T1,$Xhi # 942 pxor $Xhn,$Xmn # 943 944 movdqa $Xi,$T2 # 2nd phase 945 psrlq \$1,$Xi 946 pclmulqdq \$0x11,$Hkey,$Xhn ####### 947 pxor $T2,$Xhi # 948 pxor $Xi,$T2 949 psrlq \$5,$Xi 950 pxor $T2,$Xi # 951 lea 32($inp),$inp 952 psrlq \$1,$Xi # 953 pclmulqdq \$0x00,$HK,$Xmn ####### 954 pxor $Xhi,$Xi # 955 956 sub \$0x20,$len 957 ja .Lmod_loop 958 959.Leven_tail: 960 movdqa $Xi,$Xhi 961 movdqa $Xmn,$T1 962 pshufd \$0b01001110,$Xi,$Xmn # 963 pxor $Xi,$Xmn # 964 965 pclmulqdq \$0x00,$Hkey2,$Xi 966 pclmulqdq \$0x11,$Hkey2,$Xhi 967 pclmulqdq \$0x10,$HK,$Xmn 968 969 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) 970 pxor $Xhn,$Xhi 971 pxor $Xi,$T1 
972 pxor $Xhi,$T1 973 pxor $T1,$Xmn 974 movdqa $Xmn,$T1 # 975 psrldq \$8,$T1 976 pslldq \$8,$Xmn # 977 pxor $T1,$Xhi 978 pxor $Xmn,$Xi # 979___ 980 &reduction_alg9 ($Xhi,$Xi); 981$code.=<<___; 982 test $len,$len 983 jnz .Ldone 984 985.Lodd_tail: 986 movdqu ($inp),$T1 # Ii 987 pshufb $T3,$T1 988 pxor $T1,$Xi # Ii+Xi 989___ 990 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) 991 &reduction_alg9 ($Xhi,$Xi); 992$code.=<<___; 993.Ldone: 994 pshufb $T3,$Xi 995 movdqu $Xi,($Xip) 996___ 997$code.=<<___ if ($win64); 998 movaps (%rsp),%xmm6 999 movaps 0x10(%rsp),%xmm7 1000 movaps 0x20(%rsp),%xmm8 1001 movaps 0x30(%rsp),%xmm9 1002 movaps 0x40(%rsp),%xmm10 1003 movaps 0x50(%rsp),%xmm11 1004 movaps 0x60(%rsp),%xmm12 1005 movaps 0x70(%rsp),%xmm13 1006 movaps 0x80(%rsp),%xmm14 1007 movaps 0x90(%rsp),%xmm15 1008 lea 0xa8(%rsp),%rsp 1009.LSEH_end_gcm_ghash_clmul: 1010___ 1011$code.=<<___; 1012 ret 1013.cfi_endproc 1014.size gcm_ghash_clmul,.-gcm_ghash_clmul 1015___ 1016} 1017 1018$code.=<<___; 1019.globl gcm_init_avx 1020.type gcm_init_avx,\@abi-omnipotent 1021.align 32 1022gcm_init_avx: 1023.cfi_startproc 1024___ 1025if ($avx) { 1026my ($Htbl,$Xip)=@_4args; 1027my $HK="%xmm6"; 1028 1029$code.=<<___ if ($win64); 1030.LSEH_begin_gcm_init_avx: 1031 # I can't trust assembler to use specific encoding:-( 1032 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp 1033 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 1034___ 1035$code.=<<___; 1036 vzeroupper 1037 1038 vmovdqu ($Xip),$Hkey 1039 vpshufd \$0b01001110,$Hkey,$Hkey # dword swap 1040 1041 # <<1 twist 1042 vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword 1043 vpsrlq \$63,$Hkey,$T1 1044 vpsllq \$1,$Hkey,$Hkey 1045 vpxor $T3,$T3,$T3 # 1046 vpcmpgtd $T2,$T3,$T3 # broadcast carry bit 1047 vpslldq \$8,$T1,$T1 1048 vpor $T1,$Hkey,$Hkey # H<<=1 1049 1050 # magic reduction 1051 vpand .L0x1c2_polynomial(%rip),$T3,$T3 1052 vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial 1053 1054 vpunpckhqdq $Hkey,$Hkey,$HK 1055 vmovdqa $Hkey,$Xi 1056 
vpxor $Hkey,$HK,$HK 1057 mov \$4,%r10 # up to H^8 1058 jmp .Linit_start_avx 1059___ 1060 1061sub clmul64x64_avx { 1062my ($Xhi,$Xi,$Hkey,$HK)=@_; 1063 1064if (!defined($HK)) { $HK = $T2; 1065$code.=<<___; 1066 vpunpckhqdq $Xi,$Xi,$T1 1067 vpunpckhqdq $Hkey,$Hkey,$T2 1068 vpxor $Xi,$T1,$T1 # 1069 vpxor $Hkey,$T2,$T2 1070___ 1071} else { 1072$code.=<<___; 1073 vpunpckhqdq $Xi,$Xi,$T1 1074 vpxor $Xi,$T1,$T1 # 1075___ 1076} 1077$code.=<<___; 1078 vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### 1079 vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### 1080 vpclmulqdq \$0x00,$HK,$T1,$T1 ####### 1081 vpxor $Xi,$Xhi,$T2 # 1082 vpxor $T2,$T1,$T1 # 1083 1084 vpslldq \$8,$T1,$T2 # 1085 vpsrldq \$8,$T1,$T1 1086 vpxor $T2,$Xi,$Xi # 1087 vpxor $T1,$Xhi,$Xhi 1088___ 1089} 1090 1091sub reduction_avx { 1092my ($Xhi,$Xi) = @_; 1093 1094$code.=<<___; 1095 vpsllq \$57,$Xi,$T1 # 1st phase 1096 vpsllq \$62,$Xi,$T2 1097 vpxor $T1,$T2,$T2 # 1098 vpsllq \$63,$Xi,$T1 1099 vpxor $T1,$T2,$T2 # 1100 vpslldq \$8,$T2,$T1 # 1101 vpsrldq \$8,$T2,$T2 1102 vpxor $T1,$Xi,$Xi # 1103 vpxor $T2,$Xhi,$Xhi 1104 1105 vpsrlq \$1,$Xi,$T2 # 2nd phase 1106 vpxor $Xi,$Xhi,$Xhi 1107 vpxor $T2,$Xi,$Xi # 1108 vpsrlq \$5,$T2,$T2 1109 vpxor $T2,$Xi,$Xi # 1110 vpsrlq \$1,$Xi,$Xi # 1111 vpxor $Xhi,$Xi,$Xi # 1112___ 1113} 1114 1115$code.=<<___; 1116.align 32 1117.Linit_loop_avx: 1118 vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... 
1119 vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" 1120___ 1121 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 1122 &reduction_avx ($Xhi,$Xi); 1123$code.=<<___; 1124.Linit_start_avx: 1125 vmovdqa $Xi,$T3 1126___ 1127 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 1128 &reduction_avx ($Xhi,$Xi); 1129$code.=<<___; 1130 vpshufd \$0b01001110,$T3,$T1 1131 vpshufd \$0b01001110,$Xi,$T2 1132 vpxor $T3,$T1,$T1 # Karatsuba pre-processing 1133 vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 1134 vpxor $Xi,$T2,$T2 # Karatsuba pre-processing 1135 vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 1136 lea 0x30($Htbl),$Htbl 1137 sub \$1,%r10 1138 jnz .Linit_loop_avx 1139 1140 vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped 1141 vmovdqu $T3,-0x10($Htbl) 1142 1143 vzeroupper 1144___ 1145$code.=<<___ if ($win64); 1146 movaps (%rsp),%xmm6 1147 lea 0x18(%rsp),%rsp 1148.LSEH_end_gcm_init_avx: 1149___ 1150$code.=<<___; 1151 ret 1152.cfi_endproc 1153.size gcm_init_avx,.-gcm_init_avx 1154___ 1155} else { 1156$code.=<<___; 1157 jmp .L_init_clmul 1158.cfi_endproc 1159.size gcm_init_avx,.-gcm_init_avx 1160___ 1161} 1162 1163$code.=<<___; 1164.globl gcm_gmult_avx 1165.type gcm_gmult_avx,\@abi-omnipotent 1166.align 32 1167gcm_gmult_avx: 1168.cfi_startproc 1169 jmp .L_gmult_clmul 1170.cfi_endproc 1171.size gcm_gmult_avx,.-gcm_gmult_avx 1172___ 1173 1174$code.=<<___; 1175.globl gcm_ghash_avx 1176.type gcm_ghash_avx,\@abi-omnipotent 1177.align 32 1178gcm_ghash_avx: 1179.cfi_startproc 1180___ 1181if ($avx) { 1182my ($Xip,$Htbl,$inp,$len)=@_4args; 1183my ($Xlo,$Xhi,$Xmi, 1184 $Zlo,$Zhi,$Zmi, 1185 $Hkey,$HK,$T1,$T2, 1186 $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); 1187 1188$code.=<<___ if ($win64); 1189 lea -0x88(%rsp),%rax 1190.LSEH_begin_gcm_ghash_avx: 1191 # I can't trust assembler to use specific encoding:-( 1192 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp 1193 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) 1194 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) 
1195 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) 1196 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) 1197 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) 1198 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) 1199 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) 1200 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) 1201 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) 1202 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) 1203___ 1204$code.=<<___; 1205 vzeroupper 1206 1207 vmovdqu ($Xip),$Xi # load $Xi 1208 lea .L0x1c2_polynomial(%rip),%r10 1209 lea 0x40($Htbl),$Htbl # size optimization 1210 vmovdqu .Lbswap_mask(%rip),$bswap 1211 vpshufb $bswap,$Xi,$Xi 1212 cmp \$0x80,$len 1213 jb .Lshort_avx 1214 sub \$0x80,$len 1215 1216 vmovdqu 0x70($inp),$Ii # I[7] 1217 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 1218 vpshufb $bswap,$Ii,$Ii 1219 vmovdqu 0x20-0x40($Htbl),$HK 1220 1221 vpunpckhqdq $Ii,$Ii,$T2 1222 vmovdqu 0x60($inp),$Ij # I[6] 1223 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1224 vpxor $Ii,$T2,$T2 1225 vpshufb $bswap,$Ij,$Ij 1226 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1227 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 1228 vpunpckhqdq $Ij,$Ij,$T1 1229 vmovdqu 0x50($inp),$Ii # I[5] 1230 vpclmulqdq \$0x00,$HK,$T2,$Xmi 1231 vpxor $Ij,$T1,$T1 1232 1233 vpshufb $bswap,$Ii,$Ii 1234 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1235 vpunpckhqdq $Ii,$Ii,$T2 1236 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1237 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 1238 vpxor $Ii,$T2,$T2 1239 vmovdqu 0x40($inp),$Ij # I[4] 1240 vpclmulqdq \$0x10,$HK,$T1,$Zmi 1241 vmovdqu 0x50-0x40($Htbl),$HK 1242 1243 vpshufb $bswap,$Ij,$Ij 1244 vpxor $Xlo,$Zlo,$Zlo 1245 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1246 vpxor $Xhi,$Zhi,$Zhi 1247 vpunpckhqdq $Ij,$Ij,$T1 1248 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1249 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 1250 vpxor $Xmi,$Zmi,$Zmi 1251 vpclmulqdq \$0x00,$HK,$T2,$Xmi 1252 vpxor $Ij,$T1,$T1 1253 1254 vmovdqu 0x30($inp),$Ii # I[3] 1255 vpxor $Zlo,$Xlo,$Xlo 1256 
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1257 vpxor $Zhi,$Xhi,$Xhi 1258 vpshufb $bswap,$Ii,$Ii 1259 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1260 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 1261 vpxor $Zmi,$Xmi,$Xmi 1262 vpunpckhqdq $Ii,$Ii,$T2 1263 vpclmulqdq \$0x10,$HK,$T1,$Zmi 1264 vmovdqu 0x80-0x40($Htbl),$HK 1265 vpxor $Ii,$T2,$T2 1266 1267 vmovdqu 0x20($inp),$Ij # I[2] 1268 vpxor $Xlo,$Zlo,$Zlo 1269 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1270 vpxor $Xhi,$Zhi,$Zhi 1271 vpshufb $bswap,$Ij,$Ij 1272 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1273 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 1274 vpxor $Xmi,$Zmi,$Zmi 1275 vpunpckhqdq $Ij,$Ij,$T1 1276 vpclmulqdq \$0x00,$HK,$T2,$Xmi 1277 vpxor $Ij,$T1,$T1 1278 1279 vmovdqu 0x10($inp),$Ii # I[1] 1280 vpxor $Zlo,$Xlo,$Xlo 1281 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1282 vpxor $Zhi,$Xhi,$Xhi 1283 vpshufb $bswap,$Ii,$Ii 1284 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1285 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 1286 vpxor $Zmi,$Xmi,$Xmi 1287 vpunpckhqdq $Ii,$Ii,$T2 1288 vpclmulqdq \$0x10,$HK,$T1,$Zmi 1289 vmovdqu 0xb0-0x40($Htbl),$HK 1290 vpxor $Ii,$T2,$T2 1291 1292 vmovdqu ($inp),$Ij # I[0] 1293 vpxor $Xlo,$Zlo,$Zlo 1294 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1295 vpxor $Xhi,$Zhi,$Zhi 1296 vpshufb $bswap,$Ij,$Ij 1297 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1298 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 1299 vpxor $Xmi,$Zmi,$Zmi 1300 vpclmulqdq \$0x10,$HK,$T2,$Xmi 1301 1302 lea 0x80($inp),$inp 1303 cmp \$0x80,$len 1304 jb .Ltail_avx 1305 1306 vpxor $Xi,$Ij,$Ij # accumulate $Xi 1307 sub \$0x80,$len 1308 jmp .Loop8x_avx 1309 1310.align 32 1311.Loop8x_avx: 1312 vpunpckhqdq $Ij,$Ij,$T1 1313 vmovdqu 0x70($inp),$Ii # I[7] 1314 vpxor $Xlo,$Zlo,$Zlo 1315 vpxor $Ij,$T1,$T1 1316 vpclmulqdq \$0x00,$Hkey,$Ij,$Xi 1317 vpshufb $bswap,$Ii,$Ii 1318 vpxor $Xhi,$Zhi,$Zhi 1319 vpclmulqdq \$0x11,$Hkey,$Ij,$Xo 1320 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 1321 vpunpckhqdq $Ii,$Ii,$T2 1322 vpxor $Xmi,$Zmi,$Zmi 1323 vpclmulqdq \$0x00,$HK,$T1,$Tred 1324 vmovdqu 0x20-0x40($Htbl),$HK 1325 vpxor $Ii,$T2,$T2 1326 
1327 vmovdqu 0x60($inp),$Ij # I[6] 1328 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1329 vpxor $Zlo,$Xi,$Xi # collect result 1330 vpshufb $bswap,$Ij,$Ij 1331 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1332 vxorps $Zhi,$Xo,$Xo 1333 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 1334 vpunpckhqdq $Ij,$Ij,$T1 1335 vpclmulqdq \$0x00,$HK, $T2,$Xmi 1336 vpxor $Zmi,$Tred,$Tred 1337 vxorps $Ij,$T1,$T1 1338 1339 vmovdqu 0x50($inp),$Ii # I[5] 1340 vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing 1341 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1342 vpxor $Xo,$Tred,$Tred 1343 vpslldq \$8,$Tred,$T2 1344 vpxor $Xlo,$Zlo,$Zlo 1345 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1346 vpsrldq \$8,$Tred,$Tred 1347 vpxor $T2, $Xi, $Xi 1348 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 1349 vpshufb $bswap,$Ii,$Ii 1350 vxorps $Tred,$Xo, $Xo 1351 vpxor $Xhi,$Zhi,$Zhi 1352 vpunpckhqdq $Ii,$Ii,$T2 1353 vpclmulqdq \$0x10,$HK, $T1,$Zmi 1354 vmovdqu 0x50-0x40($Htbl),$HK 1355 vpxor $Ii,$T2,$T2 1356 vpxor $Xmi,$Zmi,$Zmi 1357 1358 vmovdqu 0x40($inp),$Ij # I[4] 1359 vpalignr \$8,$Xi,$Xi,$Tred # 1st phase 1360 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1361 vpshufb $bswap,$Ij,$Ij 1362 vpxor $Zlo,$Xlo,$Xlo 1363 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1364 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 1365 vpunpckhqdq $Ij,$Ij,$T1 1366 vpxor $Zhi,$Xhi,$Xhi 1367 vpclmulqdq \$0x00,$HK, $T2,$Xmi 1368 vxorps $Ij,$T1,$T1 1369 vpxor $Zmi,$Xmi,$Xmi 1370 1371 vmovdqu 0x30($inp),$Ii # I[3] 1372 vpclmulqdq \$0x10,(%r10),$Xi,$Xi 1373 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1374 vpshufb $bswap,$Ii,$Ii 1375 vpxor $Xlo,$Zlo,$Zlo 1376 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1377 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 1378 vpunpckhqdq $Ii,$Ii,$T2 1379 vpxor $Xhi,$Zhi,$Zhi 1380 vpclmulqdq \$0x10,$HK, $T1,$Zmi 1381 vmovdqu 0x80-0x40($Htbl),$HK 1382 vpxor $Ii,$T2,$T2 1383 vpxor $Xmi,$Zmi,$Zmi 1384 1385 vmovdqu 0x20($inp),$Ij # I[2] 1386 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1387 vpshufb $bswap,$Ij,$Ij 1388 vpxor $Zlo,$Xlo,$Xlo 1389 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1390 vmovdqu 0x70-0x40($Htbl),$Hkey # 
$Hkey^6 1391 vpunpckhqdq $Ij,$Ij,$T1 1392 vpxor $Zhi,$Xhi,$Xhi 1393 vpclmulqdq \$0x00,$HK, $T2,$Xmi 1394 vpxor $Ij,$T1,$T1 1395 vpxor $Zmi,$Xmi,$Xmi 1396 vxorps $Tred,$Xi,$Xi 1397 1398 vmovdqu 0x10($inp),$Ii # I[1] 1399 vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase 1400 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1401 vpshufb $bswap,$Ii,$Ii 1402 vpxor $Xlo,$Zlo,$Zlo 1403 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1404 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 1405 vpclmulqdq \$0x10,(%r10),$Xi,$Xi 1406 vxorps $Xo,$Tred,$Tred 1407 vpunpckhqdq $Ii,$Ii,$T2 1408 vpxor $Xhi,$Zhi,$Zhi 1409 vpclmulqdq \$0x10,$HK, $T1,$Zmi 1410 vmovdqu 0xb0-0x40($Htbl),$HK 1411 vpxor $Ii,$T2,$T2 1412 vpxor $Xmi,$Zmi,$Zmi 1413 1414 vmovdqu ($inp),$Ij # I[0] 1415 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1416 vpshufb $bswap,$Ij,$Ij 1417 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1418 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 1419 vpxor $Tred,$Ij,$Ij 1420 vpclmulqdq \$0x10,$HK, $T2,$Xmi 1421 vpxor $Xi,$Ij,$Ij # accumulate $Xi 1422 1423 lea 0x80($inp),$inp 1424 sub \$0x80,$len 1425 jnc .Loop8x_avx 1426 1427 add \$0x80,$len 1428 jmp .Ltail_no_xor_avx 1429 1430.align 32 1431.Lshort_avx: 1432 vmovdqu -0x10($inp,$len),$Ii # very last word 1433 lea ($inp,$len),$inp 1434 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 1435 vmovdqu 0x20-0x40($Htbl),$HK 1436 vpshufb $bswap,$Ii,$Ij 1437 1438 vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, 1439 vmovdqa $Xhi,$Zhi # $Zhi and 1440 vmovdqa $Xmi,$Zmi # $Zmi 1441 sub \$0x10,$len 1442 jz .Ltail_avx 1443 1444 vpunpckhqdq $Ij,$Ij,$T1 1445 vpxor $Xlo,$Zlo,$Zlo 1446 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1447 vpxor $Ij,$T1,$T1 1448 vmovdqu -0x20($inp),$Ii 1449 vpxor $Xhi,$Zhi,$Zhi 1450 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1451 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 1452 vpshufb $bswap,$Ii,$Ij 1453 vpxor $Xmi,$Zmi,$Zmi 1454 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1455 vpsrldq \$8,$HK,$HK 1456 sub \$0x10,$len 1457 jz .Ltail_avx 1458 1459 vpunpckhqdq $Ij,$Ij,$T1 1460 vpxor $Xlo,$Zlo,$Zlo 1461 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1462 vpxor 
$Ij,$T1,$T1 1463 vmovdqu -0x30($inp),$Ii 1464 vpxor $Xhi,$Zhi,$Zhi 1465 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1466 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 1467 vpshufb $bswap,$Ii,$Ij 1468 vpxor $Xmi,$Zmi,$Zmi 1469 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1470 vmovdqu 0x50-0x40($Htbl),$HK 1471 sub \$0x10,$len 1472 jz .Ltail_avx 1473 1474 vpunpckhqdq $Ij,$Ij,$T1 1475 vpxor $Xlo,$Zlo,$Zlo 1476 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1477 vpxor $Ij,$T1,$T1 1478 vmovdqu -0x40($inp),$Ii 1479 vpxor $Xhi,$Zhi,$Zhi 1480 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1481 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 1482 vpshufb $bswap,$Ii,$Ij 1483 vpxor $Xmi,$Zmi,$Zmi 1484 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1485 vpsrldq \$8,$HK,$HK 1486 sub \$0x10,$len 1487 jz .Ltail_avx 1488 1489 vpunpckhqdq $Ij,$Ij,$T1 1490 vpxor $Xlo,$Zlo,$Zlo 1491 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1492 vpxor $Ij,$T1,$T1 1493 vmovdqu -0x50($inp),$Ii 1494 vpxor $Xhi,$Zhi,$Zhi 1495 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1496 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 1497 vpshufb $bswap,$Ii,$Ij 1498 vpxor $Xmi,$Zmi,$Zmi 1499 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1500 vmovdqu 0x80-0x40($Htbl),$HK 1501 sub \$0x10,$len 1502 jz .Ltail_avx 1503 1504 vpunpckhqdq $Ij,$Ij,$T1 1505 vpxor $Xlo,$Zlo,$Zlo 1506 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1507 vpxor $Ij,$T1,$T1 1508 vmovdqu -0x60($inp),$Ii 1509 vpxor $Xhi,$Zhi,$Zhi 1510 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1511 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 1512 vpshufb $bswap,$Ii,$Ij 1513 vpxor $Xmi,$Zmi,$Zmi 1514 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1515 vpsrldq \$8,$HK,$HK 1516 sub \$0x10,$len 1517 jz .Ltail_avx 1518 1519 vpunpckhqdq $Ij,$Ij,$T1 1520 vpxor $Xlo,$Zlo,$Zlo 1521 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1522 vpxor $Ij,$T1,$T1 1523 vmovdqu -0x70($inp),$Ii 1524 vpxor $Xhi,$Zhi,$Zhi 1525 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1526 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 1527 vpshufb $bswap,$Ii,$Ij 1528 vpxor $Xmi,$Zmi,$Zmi 1529 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1530 vmovq 0xb8-0x40($Htbl),$HK 1531 sub \$0x10,$len 1532 jmp .Ltail_avx 1533 
.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
# Win64 builds only: reload %xmm6-%xmm15 from the stack area at (%rsp) and
# release its 0xa8 bytes; .LSEH_end_gcm_ghash_avx marks the end of the range
# listed for gcm_ghash_avx in the .pdata table emitted further down.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_avx:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
# Other arm of a conditional that opens before this part of the file (its
# condition is not visible here): gcm_ghash_avx is emitted as a bare jump
# to .L_ghash_clmul.
$code.=<<___;
	jmp	.L_ghash_clmul
.cfi_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

# Constant data emitted after the code:
#   .Lbswap_mask       - 16 byte indices 15..0; the AVX code loads it into
#                        $bswap and applies it with vpshufb.
#   .L0x1c2_polynomial - 16-byte constant; the AVX code keeps its address in
#                        %r10 and uses it as the vpclmulqdq operand in the
#                        "1st phase"/"2nd phase" steps.
#   .L7_mask, .L7_mask_poly - not referenced in this part of the file.
#   .Lrem_4bit         - 16 entries of two .long each, 128 bytes in total.
#   .Lrem_8bit         - 256 .value entries (32 rows of 8); 512 bytes assuming
#                        2-byte .value.  Together with .Lrem_4bit these are
#                        presumably the 128-byte and 512-byte shared tables
#                        mentioned in the file header.
# Back-quoted items such as `0x1C20<<16` are Perl expressions; they are
# replaced by their value by the substitution at the end of the file.
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.L7_mask_poly:
	.long	7,0,`0xE1<<1`,0
.align	64
.type	.Lrem_4bit,\@object
.Lrem_4bit:
	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
.type	.Lrem_8bit,\@object
.Lrem_8bit:
	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception-handling support, emitted only when $win64 is
# set: the se_handler routine followed by the .pdata and .xdata records.
# $rec, $frame, $context and $disp name the registers (%rcx, %rdx, %r8, %r9)
# that carry the four handler arguments listed in the prototype above.
#
# se_handler flow: HandlerData[0] and HandlerData[1] are image-relative
# prologue and epilogue labels.  When context->Rip lies at or after the
# prologue label and before the epilogue label, the handler steps 48+280
# bytes up from context->Rsp and copies the saved %rbx, %rbp and %r12-%r15
# into the CONTEXT; otherwise that step is skipped.  In both cases it then
# stores Rsp, Rsi and Rdi into the CONTEXT, copies the CONTEXT (1232/8
# quadwords) to disp->ContextRecord, calls RtlVirtualUnwind and returns
# 1 (ExceptionContinueSearch).
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	48+280(%rax),%rax	# adjust "rsp"

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_gcm_gmult_4bit
	.rva	.LSEH_end_gcm_gmult_4bit
	.rva	.LSEH_info_gcm_gmult_4bit

	.rva	.LSEH_begin_gcm_ghash_4bit
	.rva	.LSEH_end_gcm_ghash_4bit
	.rva	.LSEH_info_gcm_ghash_4bit

	.rva	.LSEH_begin_gcm_init_clmul
	.rva	.LSEH_end_gcm_init_clmul
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_clmul
	.rva	.LSEH_end_gcm_ghash_clmul
	.rva	.LSEH_info_gcm_ghash_clmul
___
# With $avx set, gcm_init_avx and gcm_ghash_avx get .pdata entries as well;
# both point at the info records of their clmul counterparts
# (.LSEH_info_gcm_init_clmul and .LSEH_info_gcm_ghash_clmul).
$code.=<<___	if ($avx);
	.rva	.LSEH_begin_gcm_init_avx
	.rva	.LSEH_end_gcm_init_avx
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_avx
	.rva	.LSEH_end_gcm_ghash_avx
	.rva	.LSEH_info_gcm_ghash_clmul
___
# .xdata: the info records named by the .pdata triples.  The two 4-bit
# records reference se_handler and pass their prologue/epilogue labels as
# HandlerData; the two clmul records are raw .byte sequences whose per-line
# comments name the prologue instruction each entry stands for.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
.LSEH_info_gcm_ghash_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_init_clmul:
	.byte	0x01,0x08,0x03,0x00
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x22,0x00,0x00	#sub rsp,0x18
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub rsp,0xa8
___
}

# Replace every back-quoted Perl expression in the accumulated text
# (e.g. `0xE1<<1`, `1232/8`) with the result of evaluating it.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Write the finished text to STDOUT.
print $code;

# Abort with a diagnostic if STDOUT cannot be closed.
close STDOUT or die "error closing STDOUT: $!";