1#! /usr/bin/env perl 2# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# March, June 2010 18# 19# The module implements "4-bit" GCM GHASH function and underlying 20# single multiplication operation in GF(2^128). "4-bit" means that 21# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH 22# function features so called "528B" variant utilizing additional 23# 256+16 bytes of per-key storage [+512 bytes shared table]. 24# Performance results are for this streamed GHASH subroutine and are 25# expressed in cycles per processed byte, less is better: 26# 27# gcc 3.4.x(*) assembler 28# 29# P4 28.6 14.0 +100% 30# Opteron 19.3 7.7 +150% 31# Core2 17.8 8.1(**) +120% 32# Atom 31.6 16.8 +88% 33# VIA Nano 21.8 10.1 +115% 34# 35# (*) comparison is not completely fair, because C results are 36# for vanilla "256B" implementation, while assembler results 37# are for "528B";-) 38# (**) it's mystery [to me] why Core2 result is not same as for 39# Opteron; 40 41# May 2010 42# 43# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. 44# See ghash-x86.pl for background information and details about coding 45# techniques. 46# 47# Special thanks to David Woodhouse for providing access to a 48# Westmere-based system on behalf of Intel Open Source Technology Centre. 49 50# December 2012 51# 52# Overhaul: aggregate Karatsuba post-processing, improve ILP in 53# reduction_alg9, increase reduction aggregate factor to 4x. As for 54# the latter. ghash-x86.pl discusses that it makes lesser sense to 55# increase aggregate factor. Then why increase here? Critical path 56# consists of 3 independent pclmulqdq instructions, Karatsuba post- 57# processing and reduction. "On top" of this we lay down aggregated 58# multiplication operations, triplets of independent pclmulqdq's. As 59# issue rate for pclmulqdq is limited, it makes lesser sense to 60# aggregate more multiplications than it takes to perform remaining 61# non-multiplication operations. 2x is near-optimal coefficient for 62# contemporary Intel CPUs (therefore modest improvement coefficient), 63# but not for Bulldozer. Latter is because logical SIMD operations 64# are twice as slow in comparison to Intel, so that critical path is 65# longer. A CPU with higher pclmulqdq issue rate would also benefit 66# from higher aggregate factor... 67# 68# Westmere 1.78(+13%) 69# Sandy Bridge 1.80(+8%) 70# Ivy Bridge 1.80(+7%) 71# Haswell 0.55(+93%) (if system doesn't support AVX) 72# Broadwell 0.45(+110%)(if system doesn't support AVX) 73# Skylake 0.44(+110%)(if system doesn't support AVX) 74# Bulldozer 1.49(+27%) 75# Silvermont 2.88(+13%) 76# Knights L 2.12(-) (if system doesn't support AVX) 77# Goldmont 1.08(+24%) 78 79# March 2013 80# 81# ... 8x aggregate factor AVX code path is using reduction algorithm 82# suggested by Shay Gueron[1]. Even though contemporary AVX-capable 83# CPUs such as Sandy and Ivy Bridge can execute it, the code performs 84# sub-optimally in comparison to above mentioned version. But thanks 85# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that 86# it performs in 0.41 cycles per byte on Haswell processor, in 87# 0.29 on Broadwell, and in 0.36 on Skylake. 88# 89# Knights Landing achieves 1.09 cpb. 90# 91# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest 92 93$flavour = shift; 94$output = shift; 95if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 96 97$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 98 99$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 100( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 101( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 102die "can't locate x86_64-xlate.pl"; 103 104if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 105 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 106 $avx = ($1>=2.20) + ($1>=2.22); 107} 108 109if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 110 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 111 $avx = ($1>=2.09) + ($1>=2.10); 112} 113 114if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 115 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 116 $avx = ($1>=10) + ($1>=11); 117} 118 119if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { 120 $avx = ($2>=3.0) + ($2>3.0); 121} 122 123open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 124*STDOUT=*OUT; 125 126$do4xaggr=1; 127 128# common register layout 129$nlo="%rax"; 130$nhi="%rbx"; 131$Zlo="%r8"; 132$Zhi="%r9"; 133$tmp="%r10"; 134$rem_4bit = "%r11"; 135 136$Xi="%rdi"; 137$Htbl="%rsi"; 138 139# per-function register layout 140$cnt="%rcx"; 141$rem="%rdx"; 142 143sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or 144 $r =~ s/%[er]([sd]i)/%\1l/ or 145 $r =~ s/%[er](bp)/%\1l/ or 146 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } 147 148sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 149{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 150 my $arg = pop; 151 $arg = "\$$arg" if ($arg*1 eq $arg); 152 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 153} 154 155{ my $N; 156 sub loop() { 157 my $inp = shift; 158 159 $N++; 160$code.=<<___; 161 xor $nlo,$nlo 162 xor $nhi,$nhi 163 mov `&LB("$Zlo")`,`&LB("$nlo")` 164 mov `&LB("$Zlo")`,`&LB("$nhi")` 165 shl \$4,`&LB("$nlo")` 166 mov \$14,$cnt 167 mov 8($Htbl,$nlo),$Zlo 168 mov ($Htbl,$nlo),$Zhi 169 and \$0xf0,`&LB("$nhi")` 170 mov $Zlo,$rem 171 jmp .Loop$N 172 173.align 16 174.Loop$N: 175 shr \$4,$Zlo 176 and \$0xf,$rem 177 mov $Zhi,$tmp 178 mov ($inp,$cnt),`&LB("$nlo")` 179 shr \$4,$Zhi 180 xor 8($Htbl,$nhi),$Zlo 181 shl \$60,$tmp 182 xor ($Htbl,$nhi),$Zhi 183 mov `&LB("$nlo")`,`&LB("$nhi")` 184 xor ($rem_4bit,$rem,8),$Zhi 185 mov $Zlo,$rem 186 shl \$4,`&LB("$nlo")` 187 xor $tmp,$Zlo 188 dec $cnt 189 js .Lbreak$N 190 191 shr \$4,$Zlo 192 and \$0xf,$rem 193 mov $Zhi,$tmp 194 shr \$4,$Zhi 195 xor 8($Htbl,$nlo),$Zlo 196 shl \$60,$tmp 197 xor ($Htbl,$nlo),$Zhi 198 and \$0xf0,`&LB("$nhi")` 199 xor ($rem_4bit,$rem,8),$Zhi 200 mov $Zlo,$rem 201 xor $tmp,$Zlo 202 jmp .Loop$N 203 204.align 16 205.Lbreak$N: 206 shr \$4,$Zlo 207 and \$0xf,$rem 208 mov $Zhi,$tmp 209 shr \$4,$Zhi 210 xor 8($Htbl,$nlo),$Zlo 211 shl \$60,$tmp 212 xor ($Htbl,$nlo),$Zhi 213 and \$0xf0,`&LB("$nhi")` 214 xor ($rem_4bit,$rem,8),$Zhi 215 mov $Zlo,$rem 216 xor $tmp,$Zlo 217 218 shr \$4,$Zlo 219 and \$0xf,$rem 220 mov $Zhi,$tmp 221 shr \$4,$Zhi 222 xor 8($Htbl,$nhi),$Zlo 223 shl \$60,$tmp 224 xor ($Htbl,$nhi),$Zhi 225 xor $tmp,$Zlo 226 xor ($rem_4bit,$rem,8),$Zhi 227 228 bswap $Zlo 229 bswap $Zhi 230___ 231}} 232 233$code=<<___; 234.text 235.extern OPENSSL_ia32cap_P 236 237.globl gcm_gmult_4bit 238.type gcm_gmult_4bit,\@function,2 239.align 16 240gcm_gmult_4bit: 241.cfi_startproc 242 push %rbx 243.cfi_push %rbx 244 push %rbp # %rbp and others are pushed exclusively in 245.cfi_push %rbp 246 push %r12 # order to reuse Win64 exception handler... 247.cfi_push %r12 248 push %r13 249.cfi_push %r13 250 push %r14 251.cfi_push %r14 252 push %r15 253.cfi_push %r15 254 sub \$280,%rsp 255.cfi_adjust_cfa_offset 280 256.Lgmult_prologue: 257 258 movzb 15($Xi),$Zlo 259 lea .Lrem_4bit(%rip),$rem_4bit 260___ 261 &loop ($Xi); 262$code.=<<___; 263 mov $Zlo,8($Xi) 264 mov $Zhi,($Xi) 265 266 lea 280+48(%rsp),%rsi 267.cfi_def_cfa %rsi,8 268 mov -8(%rsi),%rbx 269.cfi_restore %rbx 270 lea (%rsi),%rsp 271.cfi_def_cfa_register %rsp 272.Lgmult_epilogue: 273 ret 274.cfi_endproc 275.size gcm_gmult_4bit,.-gcm_gmult_4bit 276___ 277 278# per-function register layout 279$inp="%rdx"; 280$len="%rcx"; 281$rem_8bit=$rem_4bit; 282 283$code.=<<___; 284.globl gcm_ghash_4bit 285.type gcm_ghash_4bit,\@function,4 286.align 16 287gcm_ghash_4bit: 288.cfi_startproc 289 push %rbx 290.cfi_push %rbx 291 push %rbp 292.cfi_push %rbp 293 push %r12 294.cfi_push %r12 295 push %r13 296.cfi_push %r13 297 push %r14 298.cfi_push %r14 299 push %r15 300.cfi_push %r15 301 sub \$280,%rsp 302.cfi_adjust_cfa_offset 280 303.Lghash_prologue: 304 mov $inp,%r14 # reassign couple of args 305 mov $len,%r15 306___ 307{ my $inp="%r14"; 308 my $dat="%edx"; 309 my $len="%r15"; 310 my @nhi=("%ebx","%ecx"); 311 my @rem=("%r12","%r13"); 312 my $Hshr4="%rbp"; 313 314 &sub ($Htbl,-128); # size optimization 315 &lea ($Hshr4,"16+128(%rsp)"); 316 { my @lo =($nlo,$nhi); 317 my @hi =($Zlo,$Zhi); 318 319 &xor ($dat,$dat); 320 for ($i=0,$j=-2;$i<18;$i++,$j++) { 321 &mov ("$j(%rsp)",&LB($dat)) if ($i>1); 322 &or ($lo[0],$tmp) if ($i>1); 323 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); 324 &shr ($lo[1],4) if ($i>0 && $i<17); 325 &mov ($tmp,$hi[1]) if ($i>0 && $i<17); 326 &shr ($hi[1],4) if ($i>0 && $i<17); 327 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); 328 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); 329 &shl (&LB($dat),4) if ($i>0 && $i<17); 330 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); 331 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); 332 &shl ($tmp,60) if ($i>0 && $i<17); 333 334 push (@lo,shift(@lo)); 335 push (@hi,shift(@hi)); 336 } 337 } 338 &add ($Htbl,-128); 339 &mov ($Zlo,"8($Xi)"); 340 &mov ($Zhi,"0($Xi)"); 341 &add ($len,$inp); # pointer to the end of data 342 &lea ($rem_8bit,".Lrem_8bit(%rip)"); 343 &jmp (".Louter_loop"); 344 345$code.=".align 16\n.Louter_loop:\n"; 346 &xor ($Zhi,"($inp)"); 347 &mov ("%rdx","8($inp)"); 348 &lea ($inp,"16($inp)"); 349 &xor ("%rdx",$Zlo); 350 &mov ("($Xi)",$Zhi); 351 &mov ("8($Xi)","%rdx"); 352 &shr ("%rdx",32); 353 354 &xor ($nlo,$nlo); 355 &rol ($dat,8); 356 &mov (&LB($nlo),&LB($dat)); 357 &movz ($nhi[0],&LB($dat)); 358 &shl (&LB($nlo),4); 359 &shr ($nhi[0],4); 360 361 for ($j=11,$i=0;$i<15;$i++) { 362 &rol ($dat,8); 363 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); 364 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); 365 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); 366 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); 367 368 &mov (&LB($nlo),&LB($dat)); 369 &xor ($Zlo,$tmp) if ($i>0); 370 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); 371 372 &movz ($nhi[1],&LB($dat)); 373 &shl (&LB($nlo),4); 374 &movzb ($rem[0],"(%rsp,$nhi[0])"); 375 376 &shr ($nhi[1],4) if ($i<14); 377 &and ($nhi[1],0xf0) if ($i==14); 378 &shl ($rem[1],48) if ($i>0); 379 &xor ($rem[0],$Zlo); 380 381 &mov ($tmp,$Zhi); 382 &xor ($Zhi,$rem[1]) if ($i>0); 383 &shr ($Zlo,8); 384 385 &movz ($rem[0],&LB($rem[0])); 386 &mov ($dat,"$j($Xi)") if (--$j%4==0); 387 &shr ($Zhi,8); 388 389 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); 390 &shl ($tmp,56); 391 &xor ($Zhi,"($Hshr4,$nhi[0],8)"); 392 393 unshift (@nhi,pop(@nhi)); # "rotate" registers 394 unshift (@rem,pop(@rem)); 395 } 396 &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); 397 &xor ($Zlo,"8($Htbl,$nlo)"); 398 &xor ($Zhi,"($Htbl,$nlo)"); 399 400 &shl ($rem[1],48); 401 &xor ($Zlo,$tmp); 402 403 &xor ($Zhi,$rem[1]); 404 &movz ($rem[0],&LB($Zlo)); 405 &shr ($Zlo,4); 406 407 &mov ($tmp,$Zhi); 408 &shl (&LB($rem[0]),4); 409 &shr ($Zhi,4); 410 411 &xor ($Zlo,"8($Htbl,$nhi[0])"); 412 &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); 413 &shl ($tmp,60); 414 415 &xor ($Zhi,"($Htbl,$nhi[0])"); 416 &xor ($Zlo,$tmp); 417 &shl ($rem[0],48); 418 419 &bswap ($Zlo); 420 &xor ($Zhi,$rem[0]); 421 422 &bswap ($Zhi); 423 &cmp ($inp,$len); 424 &jb (".Louter_loop"); 425} 426$code.=<<___; 427 mov $Zlo,8($Xi) 428 mov $Zhi,($Xi) 429 430 lea 280+48(%rsp),%rsi 431.cfi_def_cfa %rsi,8 432 mov -48(%rsi),%r15 433.cfi_restore %r15 434 mov -40(%rsi),%r14 435.cfi_restore %r14 436 mov -32(%rsi),%r13 437.cfi_restore %r13 438 mov -24(%rsi),%r12 439.cfi_restore %r12 440 mov -16(%rsi),%rbp 441.cfi_restore %rbp 442 mov -8(%rsi),%rbx 443.cfi_restore %rbx 444 lea 0(%rsi),%rsp 445.cfi_def_cfa_register %rsp 446.Lghash_epilogue: 447 ret 448.cfi_endproc 449.size gcm_ghash_4bit,.-gcm_ghash_4bit 450___ 451 452###################################################################### 453# PCLMULQDQ version. 454 455@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 456 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 457 458($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; 459($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); 460 461sub clmul64x64_T2 { # minimal register pressure 462my ($Xhi,$Xi,$Hkey,$HK)=@_; 463 464if (!defined($HK)) { $HK = $T2; 465$code.=<<___; 466 movdqa $Xi,$Xhi # 467 pshufd \$0b01001110,$Xi,$T1 468 pshufd \$0b01001110,$Hkey,$T2 469 pxor $Xi,$T1 # 470 pxor $Hkey,$T2 471___ 472} else { 473$code.=<<___; 474 movdqa $Xi,$Xhi # 475 pshufd \$0b01001110,$Xi,$T1 476 pxor $Xi,$T1 # 477___ 478} 479$code.=<<___; 480 pclmulqdq \$0x00,$Hkey,$Xi ####### 481 pclmulqdq \$0x11,$Hkey,$Xhi ####### 482 pclmulqdq \$0x00,$HK,$T1 ####### 483 pxor $Xi,$T1 # 484 pxor $Xhi,$T1 # 485 486 movdqa $T1,$T2 # 487 psrldq \$8,$T1 488 pslldq \$8,$T2 # 489 pxor $T1,$Xhi 490 pxor $T2,$Xi # 491___ 492} 493 494sub reduction_alg9 { # 17/11 times faster than Intel version 495my ($Xhi,$Xi) = @_; 496 497$code.=<<___; 498 # 1st phase 499 movdqa $Xi,$T2 # 500 movdqa $Xi,$T1 501 psllq \$5,$Xi 502 pxor $Xi,$T1 # 503 psllq \$1,$Xi 504 pxor $T1,$Xi # 505 psllq \$57,$Xi # 506 movdqa $Xi,$T1 # 507 pslldq \$8,$Xi 508 psrldq \$8,$T1 # 509 pxor $T2,$Xi 510 pxor $T1,$Xhi # 511 512 # 2nd phase 513 movdqa $Xi,$T2 514 psrlq \$1,$Xi 515 pxor $T2,$Xhi # 516 pxor $Xi,$T2 517 psrlq \$5,$Xi 518 pxor $T2,$Xi # 519 psrlq \$1,$Xi # 520 pxor $Xhi,$Xi # 521___ 522} 523 524{ my ($Htbl,$Xip)=@_4args; 525 my $HK="%xmm6"; 526 527$code.=<<___; 528.globl gcm_init_clmul 529.type gcm_init_clmul,\@abi-omnipotent 530.align 16 531gcm_init_clmul: 532.L_init_clmul: 533___ 534$code.=<<___ if ($win64); 535.LSEH_begin_gcm_init_clmul: 536 # I can't trust assembler to use specific encoding:-( 537 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp 538 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 539___ 540$code.=<<___; 541 movdqu ($Xip),$Hkey 542 pshufd \$0b01001110,$Hkey,$Hkey # dword swap 543 544 # <<1 twist 545 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword 546 movdqa $Hkey,$T1 547 psllq \$1,$Hkey 548 pxor $T3,$T3 # 549 psrlq \$63,$T1 550 pcmpgtd $T2,$T3 # broadcast carry bit 551 pslldq \$8,$T1 552 por $T1,$Hkey # H<<=1 553 554 # magic reduction 555 pand .L0x1c2_polynomial(%rip),$T3 556 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial 557 558 # calculate H^2 559 pshufd \$0b01001110,$Hkey,$HK 560 movdqa $Hkey,$Xi 561 pxor $Hkey,$HK 562___ 563 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); 564 &reduction_alg9 ($Xhi,$Xi); 565$code.=<<___; 566 pshufd \$0b01001110,$Hkey,$T1 567 pshufd \$0b01001110,$Xi,$T2 568 pxor $Hkey,$T1 # Karatsuba pre-processing 569 movdqu $Hkey,0x00($Htbl) # save H 570 pxor $Xi,$T2 # Karatsuba pre-processing 571 movdqu $Xi,0x10($Htbl) # save H^2 572 palignr \$8,$T1,$T2 # low part is H.lo^H.hi... 573 movdqu $T2,0x20($Htbl) # save Karatsuba "salt" 574___ 575if ($do4xaggr) { 576 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 577 &reduction_alg9 ($Xhi,$Xi); 578$code.=<<___; 579 movdqa $Xi,$T3 580___ 581 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 582 &reduction_alg9 ($Xhi,$Xi); 583$code.=<<___; 584 pshufd \$0b01001110,$T3,$T1 585 pshufd \$0b01001110,$Xi,$T2 586 pxor $T3,$T1 # Karatsuba pre-processing 587 movdqu $T3,0x30($Htbl) # save H^3 588 pxor $Xi,$T2 # Karatsuba pre-processing 589 movdqu $Xi,0x40($Htbl) # save H^4 590 palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... 591 movdqu $T2,0x50($Htbl) # save Karatsuba "salt" 592___ 593} 594$code.=<<___ if ($win64); 595 movaps (%rsp),%xmm6 596 lea 0x18(%rsp),%rsp 597.LSEH_end_gcm_init_clmul: 598___ 599$code.=<<___; 600 ret 601.size gcm_init_clmul,.-gcm_init_clmul 602___ 603} 604 605{ my ($Xip,$Htbl)=@_4args; 606 607$code.=<<___; 608.globl gcm_gmult_clmul 609.type gcm_gmult_clmul,\@abi-omnipotent 610.align 16 611gcm_gmult_clmul: 612.L_gmult_clmul: 613 movdqu ($Xip),$Xi 614 movdqa .Lbswap_mask(%rip),$T3 615 movdqu ($Htbl),$Hkey 616 movdqu 0x20($Htbl),$T2 617 pshufb $T3,$Xi 618___ 619 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); 620$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); 621 # experimental alternative. special thing about is that there 622 # no dependency between the two multiplications... 623 mov \$`0xE1<<1`,%eax 624 mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff 625 mov \$0x07,%r11d 626 movq %rax,$T1 627 movq %r10,$T2 628 movq %r11,$T3 # borrow $T3 629 pand $Xi,$T3 630 pshufb $T3,$T2 # ($Xi&7)·0xE0 631 movq %rax,$T3 632 pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) 633 pxor $Xi,$T2 634 pslldq \$15,$T2 635 paddd $T2,$T2 # <<(64+56+1) 636 pxor $T2,$Xi 637 pclmulqdq \$0x01,$T3,$Xi 638 movdqa .Lbswap_mask(%rip),$T3 # reload $T3 639 psrldq \$1,$T1 640 pxor $T1,$Xhi 641 pslldq \$7,$Xi 642 pxor $Xhi,$Xi 643___ 644$code.=<<___; 645 pshufb $T3,$Xi 646 movdqu $Xi,($Xip) 647 ret 648.size gcm_gmult_clmul,.-gcm_gmult_clmul 649___ 650} 651 652{ my ($Xip,$Htbl,$inp,$len)=@_4args; 653 my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); 654 my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); 655 656$code.=<<___; 657.globl gcm_ghash_clmul 658.type gcm_ghash_clmul,\@abi-omnipotent 659.align 32 660gcm_ghash_clmul: 661.L_ghash_clmul: 662___ 663$code.=<<___ if ($win64); 664 lea -0x88(%rsp),%rax 665.LSEH_begin_gcm_ghash_clmul: 666 # I can't trust assembler to use specific encoding:-( 667 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp 668 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) 669 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) 670 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) 671 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) 672 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) 673 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) 674 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) 675 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) 676 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) 677 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) 678___ 679$code.=<<___; 680 movdqa .Lbswap_mask(%rip),$T3 681 682 movdqu ($Xip),$Xi 683 movdqu ($Htbl),$Hkey 684 movdqu 0x20($Htbl),$HK 685 pshufb $T3,$Xi 686 687 sub \$0x10,$len 688 jz .Lodd_tail 689 690 movdqu 0x10($Htbl),$Hkey2 691___ 692if ($do4xaggr) { 693my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); 694 695$code.=<<___; 696 mov OPENSSL_ia32cap_P+4(%rip),%eax 697 cmp \$0x30,$len 698 jb .Lskip4x 699 700 and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE 701 cmp \$`1<<22`,%eax # check for MOVBE without XSAVE 702 je .Lskip4x 703 704 sub \$0x30,$len 705 mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff 706 movdqu 0x30($Htbl),$Hkey3 707 movdqu 0x40($Htbl),$Hkey4 708 709 ####### 710 # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P 711 # 712 movdqu 0x30($inp),$Xln 713 movdqu 0x20($inp),$Xl 714 pshufb $T3,$Xln 715 pshufb $T3,$Xl 716 movdqa $Xln,$Xhn 717 pshufd \$0b01001110,$Xln,$Xmn 718 pxor $Xln,$Xmn 719 pclmulqdq \$0x00,$Hkey,$Xln 720 pclmulqdq \$0x11,$Hkey,$Xhn 721 pclmulqdq \$0x00,$HK,$Xmn 722 723 movdqa $Xl,$Xh 724 pshufd \$0b01001110,$Xl,$Xm 725 pxor $Xl,$Xm 726 pclmulqdq \$0x00,$Hkey2,$Xl 727 pclmulqdq \$0x11,$Hkey2,$Xh 728 pclmulqdq \$0x10,$HK,$Xm 729 xorps $Xl,$Xln 730 xorps $Xh,$Xhn 731 movups 0x50($Htbl),$HK 732 xorps $Xm,$Xmn 733 734 movdqu 0x10($inp),$Xl 735 movdqu 0($inp),$T1 736 pshufb $T3,$Xl 737 pshufb $T3,$T1 738 movdqa $Xl,$Xh 739 pshufd \$0b01001110,$Xl,$Xm 740 pxor $T1,$Xi 741 pxor $Xl,$Xm 742 pclmulqdq \$0x00,$Hkey3,$Xl 743 movdqa $Xi,$Xhi 744 pshufd \$0b01001110,$Xi,$T1 745 pxor $Xi,$T1 746 pclmulqdq \$0x11,$Hkey3,$Xh 747 pclmulqdq \$0x00,$HK,$Xm 748 xorps $Xl,$Xln 749 xorps $Xh,$Xhn 750 751 lea 0x40($inp),$inp 752 sub \$0x40,$len 753 jc .Ltail4x 754 755 jmp .Lmod4_loop 756.align 32 757.Lmod4_loop: 758 pclmulqdq \$0x00,$Hkey4,$Xi 759 xorps $Xm,$Xmn 760 movdqu 0x30($inp),$Xl 761 pshufb $T3,$Xl 762 pclmulqdq \$0x11,$Hkey4,$Xhi 763 xorps $Xln,$Xi 764 movdqu 0x20($inp),$Xln 765 movdqa $Xl,$Xh 766 pclmulqdq \$0x10,$HK,$T1 767 pshufd \$0b01001110,$Xl,$Xm 768 xorps $Xhn,$Xhi 769 pxor $Xl,$Xm 770 pshufb $T3,$Xln 771 movups 0x20($Htbl),$HK 772 xorps $Xmn,$T1 773 pclmulqdq \$0x00,$Hkey,$Xl 774 pshufd \$0b01001110,$Xln,$Xmn 775 776 pxor $Xi,$T1 # aggregated Karatsuba post-processing 777 movdqa $Xln,$Xhn 778 pxor $Xhi,$T1 # 779 pxor $Xln,$Xmn 780 movdqa $T1,$T2 # 781 pclmulqdq \$0x11,$Hkey,$Xh 782 pslldq \$8,$T1 783 psrldq \$8,$T2 # 784 pxor $T1,$Xi 785 movdqa .L7_mask(%rip),$T1 786 pxor $T2,$Xhi # 787 movq %rax,$T2 788 789 pand $Xi,$T1 # 1st phase 790 pshufb $T1,$T2 # 791 pxor $Xi,$T2 # 792 pclmulqdq \$0x00,$HK,$Xm 793 psllq \$57,$T2 # 794 movdqa $T2,$T1 # 795 pslldq \$8,$T2 796 pclmulqdq \$0x00,$Hkey2,$Xln 797 psrldq \$8,$T1 # 798 pxor $T2,$Xi 799 pxor $T1,$Xhi # 800 movdqu 0($inp),$T1 801 802 movdqa $Xi,$T2 # 2nd phase 803 psrlq \$1,$Xi 804 pclmulqdq \$0x11,$Hkey2,$Xhn 805 xorps $Xl,$Xln 806 movdqu 0x10($inp),$Xl 807 pshufb $T3,$Xl 808 pclmulqdq \$0x10,$HK,$Xmn 809 xorps $Xh,$Xhn 810 movups 0x50($Htbl),$HK 811 pshufb $T3,$T1 812 pxor $T2,$Xhi # 813 pxor $Xi,$T2 814 psrlq \$5,$Xi 815 816 movdqa $Xl,$Xh 817 pxor $Xm,$Xmn 818 pshufd \$0b01001110,$Xl,$Xm 819 pxor $T2,$Xi # 820 pxor $T1,$Xhi 821 pxor $Xl,$Xm 822 pclmulqdq \$0x00,$Hkey3,$Xl 823 psrlq \$1,$Xi # 824 pxor $Xhi,$Xi # 825 movdqa $Xi,$Xhi 826 pclmulqdq \$0x11,$Hkey3,$Xh 827 xorps $Xl,$Xln 828 pshufd \$0b01001110,$Xi,$T1 829 pxor $Xi,$T1 830 831 pclmulqdq \$0x00,$HK,$Xm 832 xorps $Xh,$Xhn 833 834 lea 0x40($inp),$inp 835 sub \$0x40,$len 836 jnc .Lmod4_loop 837 838.Ltail4x: 839 pclmulqdq \$0x00,$Hkey4,$Xi 840 pclmulqdq \$0x11,$Hkey4,$Xhi 841 pclmulqdq \$0x10,$HK,$T1 842 xorps $Xm,$Xmn 843 xorps $Xln,$Xi 844 xorps $Xhn,$Xhi 845 pxor $Xi,$Xhi # aggregated Karatsuba post-processing 846 pxor $Xmn,$T1 847 848 pxor $Xhi,$T1 # 849 pxor $Xi,$Xhi 850 851 movdqa $T1,$T2 # 852 psrldq \$8,$T1 853 pslldq \$8,$T2 # 854 pxor $T1,$Xhi 855 pxor $T2,$Xi # 856___ 857 &reduction_alg9($Xhi,$Xi); 858$code.=<<___; 859 add \$0x40,$len 860 jz .Ldone 861 movdqu 0x20($Htbl),$HK 862 sub \$0x10,$len 863 jz .Lodd_tail 864.Lskip4x: 865___ 866} 867$code.=<<___; 868 ####### 869 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = 870 # [(H*Ii+1) + (H*Xi+1)] mod P = 871 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P 872 # 873 movdqu ($inp),$T1 # Ii 874 movdqu 16($inp),$Xln # Ii+1 875 pshufb $T3,$T1 876 pshufb $T3,$Xln 877 pxor $T1,$Xi # Ii+Xi 878 879 movdqa $Xln,$Xhn 880 pshufd \$0b01001110,$Xln,$Xmn 881 pxor $Xln,$Xmn 882 pclmulqdq \$0x00,$Hkey,$Xln 883 pclmulqdq \$0x11,$Hkey,$Xhn 884 pclmulqdq \$0x00,$HK,$Xmn 885 886 lea 32($inp),$inp # i+=2 887 nop 888 sub \$0x20,$len 889 jbe .Leven_tail 890 nop 891 jmp .Lmod_loop 892 893.align 32 894.Lmod_loop: 895 movdqa $Xi,$Xhi 896 movdqa $Xmn,$T1 897 pshufd \$0b01001110,$Xi,$Xmn # 898 pxor $Xi,$Xmn # 899 900 pclmulqdq \$0x00,$Hkey2,$Xi 901 pclmulqdq \$0x11,$Hkey2,$Xhi 902 pclmulqdq \$0x10,$HK,$Xmn 903 904 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) 905 pxor $Xhn,$Xhi 906 movdqu ($inp),$T2 # Ii 907 pxor $Xi,$T1 # aggregated Karatsuba post-processing 908 pshufb $T3,$T2 909 movdqu 16($inp),$Xln # Ii+1 910 911 pxor $Xhi,$T1 912 pxor $T2,$Xhi # "Ii+Xi", consume early 913 pxor $T1,$Xmn 914 pshufb $T3,$Xln 915 movdqa $Xmn,$T1 # 916 psrldq \$8,$T1 917 pslldq \$8,$Xmn # 918 pxor $T1,$Xhi 919 pxor $Xmn,$Xi # 920 921 movdqa $Xln,$Xhn # 922 923 movdqa $Xi,$T2 # 1st phase 924 movdqa $Xi,$T1 925 psllq \$5,$Xi 926 pxor $Xi,$T1 # 927 pclmulqdq \$0x00,$Hkey,$Xln ####### 928 psllq \$1,$Xi 929 pxor $T1,$Xi # 930 psllq \$57,$Xi # 931 movdqa $Xi,$T1 # 932 pslldq \$8,$Xi 933 psrldq \$8,$T1 # 934 pxor $T2,$Xi 935 pshufd \$0b01001110,$Xhn,$Xmn 936 pxor $T1,$Xhi # 937 pxor $Xhn,$Xmn # 938 939 movdqa $Xi,$T2 # 2nd phase 940 psrlq \$1,$Xi 941 pclmulqdq \$0x11,$Hkey,$Xhn ####### 942 pxor $T2,$Xhi # 943 pxor $Xi,$T2 944 psrlq \$5,$Xi 945 pxor $T2,$Xi # 946 lea 32($inp),$inp 947 psrlq \$1,$Xi # 948 pclmulqdq \$0x00,$HK,$Xmn ####### 949 pxor $Xhi,$Xi # 950 951 sub \$0x20,$len 952 ja .Lmod_loop 953 954.Leven_tail: 955 movdqa $Xi,$Xhi 956 movdqa $Xmn,$T1 957 pshufd \$0b01001110,$Xi,$Xmn # 958 pxor $Xi,$Xmn # 959 960 pclmulqdq \$0x00,$Hkey2,$Xi 961 pclmulqdq \$0x11,$Hkey2,$Xhi 962 pclmulqdq \$0x10,$HK,$Xmn 963 964 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) 965 pxor $Xhn,$Xhi 966 pxor $Xi,$T1 967 pxor $Xhi,$T1 968 pxor $T1,$Xmn 969 movdqa $Xmn,$T1 # 970 psrldq \$8,$T1 971 pslldq \$8,$Xmn # 972 pxor $T1,$Xhi 973 pxor $Xmn,$Xi # 974___ 975 &reduction_alg9 ($Xhi,$Xi); 976$code.=<<___; 977 test $len,$len 978 jnz .Ldone 979 980.Lodd_tail: 981 movdqu ($inp),$T1 # Ii 982 pshufb $T3,$T1 983 pxor $T1,$Xi # Ii+Xi 984___ 985 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) 986 &reduction_alg9 ($Xhi,$Xi); 987$code.=<<___; 988.Ldone: 989 pshufb $T3,$Xi 990 movdqu $Xi,($Xip) 991___ 992$code.=<<___ if ($win64); 993 movaps (%rsp),%xmm6 994 movaps 0x10(%rsp),%xmm7 995 movaps 0x20(%rsp),%xmm8 996 movaps 0x30(%rsp),%xmm9 997 movaps 0x40(%rsp),%xmm10 998 movaps 0x50(%rsp),%xmm11 999 movaps 0x60(%rsp),%xmm12 1000 movaps 0x70(%rsp),%xmm13 1001 movaps 0x80(%rsp),%xmm14 1002 movaps 0x90(%rsp),%xmm15 1003 lea 0xa8(%rsp),%rsp 1004.LSEH_end_gcm_ghash_clmul: 1005___ 1006$code.=<<___; 1007 ret 1008.size gcm_ghash_clmul,.-gcm_ghash_clmul 1009___ 1010} 1011 1012$code.=<<___; 1013.globl gcm_init_avx 1014.type gcm_init_avx,\@abi-omnipotent 1015.align 32 1016gcm_init_avx: 1017___ 1018if ($avx) { 1019my ($Htbl,$Xip)=@_4args; 1020my $HK="%xmm6"; 1021 1022$code.=<<___ if ($win64); 1023.LSEH_begin_gcm_init_avx: 1024 # I can't trust assembler to use specific encoding:-( 1025 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp 1026 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 1027___ 1028$code.=<<___; 1029 vzeroupper 1030 1031 vmovdqu ($Xip),$Hkey 1032 vpshufd \$0b01001110,$Hkey,$Hkey # dword swap 1033 1034 # <<1 twist 1035 vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword 1036 vpsrlq \$63,$Hkey,$T1 1037 vpsllq \$1,$Hkey,$Hkey 1038 vpxor $T3,$T3,$T3 # 1039 vpcmpgtd $T2,$T3,$T3 # broadcast carry bit 1040 vpslldq \$8,$T1,$T1 1041 vpor $T1,$Hkey,$Hkey # H<<=1 1042 1043 # magic reduction 1044 vpand .L0x1c2_polynomial(%rip),$T3,$T3 1045 vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial 1046 1047 vpunpckhqdq $Hkey,$Hkey,$HK 1048 vmovdqa $Hkey,$Xi 1049 vpxor $Hkey,$HK,$HK 1050 mov \$4,%r10 # up to H^8 1051 jmp .Linit_start_avx 1052___ 1053 1054sub clmul64x64_avx { 1055my ($Xhi,$Xi,$Hkey,$HK)=@_; 1056 1057if (!defined($HK)) { $HK = $T2; 1058$code.=<<___; 1059 vpunpckhqdq $Xi,$Xi,$T1 1060 vpunpckhqdq $Hkey,$Hkey,$T2 1061 vpxor $Xi,$T1,$T1 # 1062 vpxor $Hkey,$T2,$T2 1063___ 1064} else { 1065$code.=<<___; 1066 vpunpckhqdq $Xi,$Xi,$T1 1067 vpxor $Xi,$T1,$T1 # 1068___ 1069} 1070$code.=<<___; 1071 vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### 1072 vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### 1073 vpclmulqdq \$0x00,$HK,$T1,$T1 ####### 1074 vpxor $Xi,$Xhi,$T2 # 1075 vpxor $T2,$T1,$T1 # 1076 1077 vpslldq \$8,$T1,$T2 # 1078 vpsrldq \$8,$T1,$T1 1079 vpxor $T2,$Xi,$Xi # 1080 vpxor $T1,$Xhi,$Xhi 1081___ 1082} 1083 1084sub reduction_avx { 1085my ($Xhi,$Xi) = @_; 1086 1087$code.=<<___; 1088 vpsllq \$57,$Xi,$T1 # 1st phase 1089 vpsllq \$62,$Xi,$T2 1090 vpxor $T1,$T2,$T2 # 1091 vpsllq \$63,$Xi,$T1 1092 vpxor $T1,$T2,$T2 # 1093 vpslldq \$8,$T2,$T1 # 1094 vpsrldq \$8,$T2,$T2 1095 vpxor $T1,$Xi,$Xi # 1096 vpxor $T2,$Xhi,$Xhi 1097 1098 vpsrlq \$1,$Xi,$T2 # 2nd phase 1099 vpxor $Xi,$Xhi,$Xhi 1100 vpxor $T2,$Xi,$Xi # 1101 vpsrlq \$5,$T2,$T2 1102 vpxor $T2,$Xi,$Xi # 1103 vpsrlq \$1,$Xi,$Xi # 1104 vpxor $Xhi,$Xi,$Xi # 1105___ 1106} 1107 1108$code.=<<___; 1109.align 32 1110.Linit_loop_avx: 1111 vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... 1112 vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" 1113___ 1114 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 1115 &reduction_avx ($Xhi,$Xi); 1116$code.=<<___; 1117.Linit_start_avx: 1118 vmovdqa $Xi,$T3 1119___ 1120 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 1121 &reduction_avx ($Xhi,$Xi); 1122$code.=<<___; 1123 vpshufd \$0b01001110,$T3,$T1 1124 vpshufd \$0b01001110,$Xi,$T2 1125 vpxor $T3,$T1,$T1 # Karatsuba pre-processing 1126 vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 1127 vpxor $Xi,$T2,$T2 # Karatsuba pre-processing 1128 vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 1129 lea 0x30($Htbl),$Htbl 1130 sub \$1,%r10 1131 jnz .Linit_loop_avx 1132 1133 vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped 1134 vmovdqu $T3,-0x10($Htbl) 1135 1136 vzeroupper 1137___ 1138$code.=<<___ if ($win64); 1139 movaps (%rsp),%xmm6 1140 lea 0x18(%rsp),%rsp 1141.LSEH_end_gcm_init_avx: 1142___ 1143$code.=<<___; 1144 ret 1145.size gcm_init_avx,.-gcm_init_avx 1146___ 1147} else { 1148$code.=<<___; 1149 jmp .L_init_clmul 1150.size gcm_init_avx,.-gcm_init_avx 1151___ 1152} 1153 1154$code.=<<___; 1155.globl gcm_gmult_avx 1156.type gcm_gmult_avx,\@abi-omnipotent 1157.align 32 1158gcm_gmult_avx: 1159 jmp .L_gmult_clmul 1160.size gcm_gmult_avx,.-gcm_gmult_avx 1161___ 1162 1163$code.=<<___; 1164.globl gcm_ghash_avx 1165.type gcm_ghash_avx,\@abi-omnipotent 1166.align 32 1167gcm_ghash_avx: 1168___ 1169if ($avx) { 1170my ($Xip,$Htbl,$inp,$len)=@_4args; 1171my ($Xlo,$Xhi,$Xmi, 1172 $Zlo,$Zhi,$Zmi, 1173 $Hkey,$HK,$T1,$T2, 1174 $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); 1175 1176$code.=<<___ if ($win64); 1177 lea -0x88(%rsp),%rax 1178.LSEH_begin_gcm_ghash_avx: 1179 # I can't trust assembler to use specific encoding:-( 1180 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp 1181 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) 1182 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) 1183 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) 1184 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) 1185 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) 1186 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) 1187 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) 1188 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) 1189 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) 1190 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) 1191___ 1192$code.=<<___; 1193 vzeroupper 1194 1195 vmovdqu ($Xip),$Xi # load $Xi 1196 lea .L0x1c2_polynomial(%rip),%r10 1197 lea 0x40($Htbl),$Htbl # size optimization 1198 vmovdqu .Lbswap_mask(%rip),$bswap 1199 vpshufb $bswap,$Xi,$Xi 1200 cmp \$0x80,$len 1201 jb .Lshort_avx 1202 sub \$0x80,$len 1203 1204 vmovdqu 0x70($inp),$Ii # I[7] 1205 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 1206 vpshufb $bswap,$Ii,$Ii 1207 vmovdqu 0x20-0x40($Htbl),$HK 1208 1209 vpunpckhqdq $Ii,$Ii,$T2 1210 vmovdqu 0x60($inp),$Ij # I[6] 1211 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1212 vpxor $Ii,$T2,$T2 1213 vpshufb $bswap,$Ij,$Ij 1214 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1215 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 1216 vpunpckhqdq $Ij,$Ij,$T1 1217 vmovdqu 0x50($inp),$Ii # I[5] 1218 vpclmulqdq \$0x00,$HK,$T2,$Xmi 1219 vpxor $Ij,$T1,$T1 1220 1221 vpshufb $bswap,$Ii,$Ii 1222 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1223 vpunpckhqdq $Ii,$Ii,$T2 1224 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1225 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 1226 vpxor $Ii,$T2,$T2 1227 vmovdqu 0x40($inp),$Ij # I[4] 1228 vpclmulqdq \$0x10,$HK,$T1,$Zmi 1229 vmovdqu 0x50-0x40($Htbl),$HK 1230 1231 vpshufb $bswap,$Ij,$Ij 1232 vpxor $Xlo,$Zlo,$Zlo 1233 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1234 vpxor $Xhi,$Zhi,$Zhi 1235 vpunpckhqdq $Ij,$Ij,$T1 1236 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1237 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 1238 vpxor $Xmi,$Zmi,$Zmi 1239 vpclmulqdq \$0x00,$HK,$T2,$Xmi 1240 vpxor $Ij,$T1,$T1 1241 1242 vmovdqu 0x30($inp),$Ii # I[3] 1243 vpxor $Zlo,$Xlo,$Xlo 1244 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1245 vpxor $Zhi,$Xhi,$Xhi 1246 vpshufb $bswap,$Ii,$Ii 1247 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1248 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 1249 vpxor $Zmi,$Xmi,$Xmi 1250 vpunpckhqdq $Ii,$Ii,$T2 1251 vpclmulqdq \$0x10,$HK,$T1,$Zmi 1252 vmovdqu 0x80-0x40($Htbl),$HK 1253 vpxor $Ii,$T2,$T2 1254 1255 vmovdqu 0x20($inp),$Ij # I[2] 1256 vpxor $Xlo,$Zlo,$Zlo 1257 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1258 vpxor $Xhi,$Zhi,$Zhi 1259 vpshufb $bswap,$Ij,$Ij 1260 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1261 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 1262 vpxor $Xmi,$Zmi,$Zmi 1263 vpunpckhqdq $Ij,$Ij,$T1 1264 vpclmulqdq \$0x00,$HK,$T2,$Xmi 1265 vpxor $Ij,$T1,$T1 1266 1267 vmovdqu 0x10($inp),$Ii # I[1] 1268 vpxor $Zlo,$Xlo,$Xlo 1269 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1270 vpxor $Zhi,$Xhi,$Xhi 1271 vpshufb $bswap,$Ii,$Ii 1272 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1273 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 1274 vpxor $Zmi,$Xmi,$Xmi 1275 vpunpckhqdq $Ii,$Ii,$T2 1276 vpclmulqdq \$0x10,$HK,$T1,$Zmi 1277 vmovdqu 0xb0-0x40($Htbl),$HK 1278 vpxor $Ii,$T2,$T2 1279 1280 vmovdqu ($inp),$Ij # I[0] 1281 vpxor $Xlo,$Zlo,$Zlo 1282 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1283 vpxor $Xhi,$Zhi,$Zhi 1284 vpshufb $bswap,$Ij,$Ij 1285 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1286 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 1287 vpxor $Xmi,$Zmi,$Zmi 1288 vpclmulqdq \$0x10,$HK,$T2,$Xmi 1289 1290 lea 0x80($inp),$inp 1291 cmp \$0x80,$len 1292 jb .Ltail_avx 1293 1294 vpxor $Xi,$Ij,$Ij # accumulate $Xi 1295 sub \$0x80,$len 1296 jmp .Loop8x_avx 1297 1298.align 32 1299.Loop8x_avx: 1300 vpunpckhqdq $Ij,$Ij,$T1 1301 vmovdqu 0x70($inp),$Ii # I[7] 1302 vpxor $Xlo,$Zlo,$Zlo 1303 vpxor $Ij,$T1,$T1 1304 vpclmulqdq \$0x00,$Hkey,$Ij,$Xi 1305 vpshufb $bswap,$Ii,$Ii 1306 vpxor $Xhi,$Zhi,$Zhi 1307 vpclmulqdq \$0x11,$Hkey,$Ij,$Xo 1308 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 1309 vpunpckhqdq $Ii,$Ii,$T2 1310 vpxor $Xmi,$Zmi,$Zmi 1311 vpclmulqdq \$0x00,$HK,$T1,$Tred 1312 vmovdqu 0x20-0x40($Htbl),$HK 1313 vpxor $Ii,$T2,$T2 1314 1315 vmovdqu 0x60($inp),$Ij # I[6] 1316 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1317 vpxor $Zlo,$Xi,$Xi # collect result 1318 vpshufb $bswap,$Ij,$Ij 1319 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1320 vxorps $Zhi,$Xo,$Xo 1321 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 1322 vpunpckhqdq $Ij,$Ij,$T1 1323 vpclmulqdq \$0x00,$HK, $T2,$Xmi 1324 vpxor $Zmi,$Tred,$Tred 1325 vxorps $Ij,$T1,$T1 1326 1327 vmovdqu 0x50($inp),$Ii # I[5] 1328 vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing 1329 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1330 vpxor $Xo,$Tred,$Tred 1331 vpslldq \$8,$Tred,$T2 1332 vpxor $Xlo,$Zlo,$Zlo 1333 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1334 vpsrldq \$8,$Tred,$Tred 1335 vpxor $T2, $Xi, $Xi 1336 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 1337 vpshufb $bswap,$Ii,$Ii 1338 vxorps $Tred,$Xo, $Xo 1339 vpxor $Xhi,$Zhi,$Zhi 1340 vpunpckhqdq $Ii,$Ii,$T2 1341 vpclmulqdq \$0x10,$HK, $T1,$Zmi 1342 vmovdqu 0x50-0x40($Htbl),$HK 1343 vpxor $Ii,$T2,$T2 1344 vpxor $Xmi,$Zmi,$Zmi 1345 1346 vmovdqu 0x40($inp),$Ij # I[4] 1347 vpalignr \$8,$Xi,$Xi,$Tred # 1st phase 1348 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1349 vpshufb $bswap,$Ij,$Ij 1350 vpxor $Zlo,$Xlo,$Xlo 1351 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1352 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 1353 vpunpckhqdq $Ij,$Ij,$T1 1354 vpxor $Zhi,$Xhi,$Xhi 1355 vpclmulqdq \$0x00,$HK, $T2,$Xmi 1356 vxorps $Ij,$T1,$T1 1357 vpxor $Zmi,$Xmi,$Xmi 1358 1359 vmovdqu 0x30($inp),$Ii # I[3] 1360 vpclmulqdq \$0x10,(%r10),$Xi,$Xi 1361 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1362 vpshufb $bswap,$Ii,$Ii 1363 vpxor $Xlo,$Zlo,$Zlo 1364 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1365 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 1366 vpunpckhqdq $Ii,$Ii,$T2 1367 vpxor $Xhi,$Zhi,$Zhi 1368 vpclmulqdq \$0x10,$HK, $T1,$Zmi 1369 vmovdqu 0x80-0x40($Htbl),$HK 1370 vpxor $Ii,$T2,$T2 1371 vpxor $Xmi,$Zmi,$Zmi 1372 1373 vmovdqu 0x20($inp),$Ij # I[2] 1374 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1375 vpshufb $bswap,$Ij,$Ij 1376 vpxor $Zlo,$Xlo,$Xlo 1377 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1378 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 1379 vpunpckhqdq $Ij,$Ij,$T1 1380 vpxor $Zhi,$Xhi,$Xhi 1381 vpclmulqdq \$0x00,$HK, $T2,$Xmi 1382 vpxor $Ij,$T1,$T1 1383 vpxor $Zmi,$Xmi,$Xmi 1384 vxorps $Tred,$Xi,$Xi 1385 1386 vmovdqu 0x10($inp),$Ii # I[1] 1387 vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase 1388 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1389 vpshufb $bswap,$Ii,$Ii 1390 vpxor $Xlo,$Zlo,$Zlo 1391 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1392 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 1393 vpclmulqdq \$0x10,(%r10),$Xi,$Xi 1394 vxorps $Xo,$Tred,$Tred 1395 vpunpckhqdq $Ii,$Ii,$T2 1396 vpxor $Xhi,$Zhi,$Zhi 1397 vpclmulqdq \$0x10,$HK, $T1,$Zmi 1398 vmovdqu 0xb0-0x40($Htbl),$HK 1399 vpxor $Ii,$T2,$T2 1400 vpxor $Xmi,$Zmi,$Zmi 1401 1402 vmovdqu ($inp),$Ij # I[0] 1403 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1404 vpshufb $bswap,$Ij,$Ij 1405 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1406 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 1407 vpxor $Tred,$Ij,$Ij 1408 vpclmulqdq \$0x10,$HK, $T2,$Xmi 1409 vpxor $Xi,$Ij,$Ij # accumulate $Xi 1410 1411 lea 0x80($inp),$inp 1412 sub \$0x80,$len 1413 jnc .Loop8x_avx 1414 1415 add \$0x80,$len 1416 jmp .Ltail_no_xor_avx 1417 1418.align 32 1419.Lshort_avx: 1420 vmovdqu -0x10($inp,$len),$Ii # very last word 1421 lea ($inp,$len),$inp 1422 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 1423 vmovdqu 0x20-0x40($Htbl),$HK 1424 vpshufb $bswap,$Ii,$Ij 1425 1426 vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, 1427 vmovdqa $Xhi,$Zhi # $Zhi and 1428 vmovdqa $Xmi,$Zmi # $Zmi 1429 sub \$0x10,$len 1430 jz .Ltail_avx 1431 1432 vpunpckhqdq $Ij,$Ij,$T1 1433 vpxor $Xlo,$Zlo,$Zlo 1434 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1435 vpxor $Ij,$T1,$T1 1436 vmovdqu -0x20($inp),$Ii 1437 vpxor $Xhi,$Zhi,$Zhi 1438 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1439 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 1440 vpshufb $bswap,$Ii,$Ij 1441 vpxor $Xmi,$Zmi,$Zmi 1442 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1443 vpsrldq \$8,$HK,$HK 1444 sub \$0x10,$len 1445 jz .Ltail_avx 1446 1447 vpunpckhqdq $Ij,$Ij,$T1 1448 vpxor $Xlo,$Zlo,$Zlo 1449 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1450 vpxor $Ij,$T1,$T1 1451 vmovdqu -0x30($inp),$Ii 1452 vpxor $Xhi,$Zhi,$Zhi 1453 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1454 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 1455 vpshufb $bswap,$Ii,$Ij 1456 vpxor $Xmi,$Zmi,$Zmi 1457 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1458 vmovdqu 0x50-0x40($Htbl),$HK 1459 sub \$0x10,$len 1460 jz .Ltail_avx 1461 1462 vpunpckhqdq $Ij,$Ij,$T1 1463 vpxor $Xlo,$Zlo,$Zlo 1464 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1465 vpxor $Ij,$T1,$T1 1466 vmovdqu -0x40($inp),$Ii 1467 vpxor $Xhi,$Zhi,$Zhi 1468 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1469 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 1470 vpshufb $bswap,$Ii,$Ij 1471 vpxor $Xmi,$Zmi,$Zmi 1472 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1473 vpsrldq \$8,$HK,$HK 1474 sub \$0x10,$len 1475 jz .Ltail_avx 1476 1477 vpunpckhqdq $Ij,$Ij,$T1 1478 vpxor $Xlo,$Zlo,$Zlo 1479 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1480 vpxor $Ij,$T1,$T1 1481 vmovdqu -0x50($inp),$Ii 1482 vpxor $Xhi,$Zhi,$Zhi 1483 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1484 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 1485 vpshufb $bswap,$Ii,$Ij 1486 vpxor $Xmi,$Zmi,$Zmi 1487 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1488 vmovdqu 0x80-0x40($Htbl),$HK 1489 sub \$0x10,$len 1490 jz .Ltail_avx 1491 1492 vpunpckhqdq $Ij,$Ij,$T1 1493 vpxor $Xlo,$Zlo,$Zlo 1494 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1495 vpxor $Ij,$T1,$T1 1496 vmovdqu -0x60($inp),$Ii 1497 vpxor $Xhi,$Zhi,$Zhi 1498 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1499 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 1500 vpshufb $bswap,$Ii,$Ij 1501 vpxor $Xmi,$Zmi,$Zmi 1502 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1503 vpsrldq \$8,$HK,$HK 1504 sub \$0x10,$len 1505 jz .Ltail_avx 1506 1507 vpunpckhqdq $Ij,$Ij,$T1 1508 vpxor $Xlo,$Zlo,$Zlo 1509 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1510 vpxor $Ij,$T1,$T1 1511 vmovdqu -0x70($inp),$Ii 1512 vpxor $Xhi,$Zhi,$Zhi 1513 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1514 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 1515 vpshufb $bswap,$Ii,$Ij 1516 vpxor $Xmi,$Zmi,$Zmi 1517 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1518 vmovq 0xb8-0x40($Htbl),$HK 1519 sub \$0x10,$len 1520 jmp .Ltail_avx 1521 1522.align 32 1523.Ltail_avx: 1524 vpxor $Xi,$Ij,$Ij # accumulate $Xi 1525.Ltail_no_xor_avx: 1526 vpunpckhqdq $Ij,$Ij,$T1 1527 vpxor $Xlo,$Zlo,$Zlo 1528 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1529 vpxor $Ij,$T1,$T1 1530 vpxor $Xhi,$Zhi,$Zhi 1531 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1532 vpxor $Xmi,$Zmi,$Zmi 1533 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1534 1535 vmovdqu (%r10),$Tred 1536 1537 vpxor $Xlo,$Zlo,$Xi 1538 vpxor $Xhi,$Zhi,$Xo 1539 vpxor $Xmi,$Zmi,$Zmi 1540 1541 vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing 1542 vpxor $Xo, $Zmi,$Zmi 1543 vpslldq \$8, $Zmi,$T2 1544 vpsrldq \$8, $Zmi,$Zmi 1545 vpxor $T2, $Xi, $Xi 1546 vpxor $Zmi,$Xo, $Xo 1547 1548 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase 1549 vpalignr \$8,$Xi,$Xi,$Xi 1550 vpxor $T2,$Xi,$Xi 1551 1552 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase 1553 vpalignr \$8,$Xi,$Xi,$Xi 1554 vpxor $Xo,$Xi,$Xi 1555 vpxor $T2,$Xi,$Xi 1556 1557 cmp \$0,$len 1558 jne .Lshort_avx 1559 1560 vpshufb $bswap,$Xi,$Xi 1561 vmovdqu $Xi,($Xip) 1562 vzeroupper 1563___ 1564$code.=<<___ if ($win64); 1565 movaps (%rsp),%xmm6 1566 movaps 0x10(%rsp),%xmm7 1567 movaps 0x20(%rsp),%xmm8 1568 movaps 0x30(%rsp),%xmm9 1569 movaps 0x40(%rsp),%xmm10 1570 movaps 0x50(%rsp),%xmm11 1571 movaps 0x60(%rsp),%xmm12 1572 movaps 0x70(%rsp),%xmm13 1573 movaps 0x80(%rsp),%xmm14 1574 movaps 0x90(%rsp),%xmm15 1575 lea 0xa8(%rsp),%rsp 1576.LSEH_end_gcm_ghash_avx: 1577___ 1578$code.=<<___; 1579 ret 1580.size gcm_ghash_avx,.-gcm_ghash_avx 1581___ 1582} else { 1583$code.=<<___; 1584 jmp .L_ghash_clmul 1585.size gcm_ghash_avx,.-gcm_ghash_avx 1586___ 1587} 1588 1589$code.=<<___; 1590.align 64 1591.Lbswap_mask: 1592 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 1593.L0x1c2_polynomial: 1594 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 1595.L7_mask: 1596 .long 7,0,7,0 1597.L7_mask_poly: 1598 .long 7,0,`0xE1<<1`,0 1599.align 64 1600.type .Lrem_4bit,\@object 1601.Lrem_4bit: 1602 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` 1603 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` 1604 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` 1605 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` 1606.type .Lrem_8bit,\@object 1607.Lrem_8bit: 1608 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E 1609 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E 1610 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E 1611 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E 1612 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E 1613 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E 1614 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E 1615 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E 1616 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE 1617 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE 1618 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE 1619 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE 1620 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E 1621 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E 1622 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE 1623 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE 1624 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E 1625 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E 1626 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E 1627 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E 1628 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E 1629 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E 1630 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E 1631 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E 1632 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE 1633 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE 1634 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE 1635 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE 1636 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E 1637 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E 1638 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE 1639 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE 1640 1641.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1642.align 64 1643___ 1644 1645# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1646# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1647if ($win64) { 1648$rec="%rcx"; 1649$frame="%rdx"; 1650$context="%r8"; 1651$disp="%r9"; 1652 1653$code.=<<___; 1654.extern __imp_RtlVirtualUnwind 1655.type se_handler,\@abi-omnipotent 1656.align 16 1657se_handler: 1658 push %rsi 1659 push %rdi 1660 push %rbx 1661 push %rbp 1662 push %r12 1663 push %r13 1664 push %r14 1665 push %r15 1666 pushfq 1667 sub \$64,%rsp 1668 1669 mov 120($context),%rax # pull context->Rax 1670 mov 248($context),%rbx # pull context->Rip 1671 1672 mov 8($disp),%rsi # disp->ImageBase 1673 mov 56($disp),%r11 # disp->HandlerData 1674 1675 mov 0(%r11),%r10d # HandlerData[0] 1676 lea (%rsi,%r10),%r10 # prologue label 1677 cmp %r10,%rbx # context->Rip<prologue label 1678 jb .Lin_prologue 1679 1680 mov 152($context),%rax # pull context->Rsp 1681 1682 mov 4(%r11),%r10d # HandlerData[1] 1683 lea (%rsi,%r10),%r10 # epilogue label 1684 cmp %r10,%rbx # context->Rip>=epilogue label 1685 jae .Lin_prologue 1686 1687 lea 48+280(%rax),%rax # adjust "rsp" 1688 1689 mov -8(%rax),%rbx 1690 mov -16(%rax),%rbp 1691 mov -24(%rax),%r12 1692 mov -32(%rax),%r13 1693 mov -40(%rax),%r14 1694 mov -48(%rax),%r15 1695 mov %rbx,144($context) # restore context->Rbx 1696 mov %rbp,160($context) # restore context->Rbp 1697 mov %r12,216($context) # restore context->R12 1698 mov %r13,224($context) # restore context->R13 1699 mov %r14,232($context) # restore context->R14 1700 mov %r15,240($context) # restore context->R15 1701 1702.Lin_prologue: 1703 mov 8(%rax),%rdi 1704 mov 16(%rax),%rsi 1705 mov %rax,152($context) # restore context->Rsp 1706 mov %rsi,168($context) # restore context->Rsi 1707 mov %rdi,176($context) # restore context->Rdi 1708 1709 mov 40($disp),%rdi # disp->ContextRecord 1710 mov $context,%rsi # context 1711 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 1712 .long 0xa548f3fc # cld; rep movsq 1713 1714 mov $disp,%rsi 1715 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1716 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1717 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1718 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1719 mov 40(%rsi),%r10 # disp->ContextRecord 1720 lea 56(%rsi),%r11 # &disp->HandlerData 1721 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1722 mov %r10,32(%rsp) # arg5 1723 mov %r11,40(%rsp) # arg6 1724 mov %r12,48(%rsp) # arg7 1725 mov %rcx,56(%rsp) # arg8, (NULL) 1726 call *__imp_RtlVirtualUnwind(%rip) 1727 1728 mov \$1,%eax # ExceptionContinueSearch 1729 add \$64,%rsp 1730 popfq 1731 pop %r15 1732 pop %r14 1733 pop %r13 1734 pop %r12 1735 pop %rbp 1736 pop %rbx 1737 pop %rdi 1738 pop %rsi 1739 ret 1740.size se_handler,.-se_handler 1741 1742.section .pdata 1743.align 4 1744 .rva .LSEH_begin_gcm_gmult_4bit 1745 .rva .LSEH_end_gcm_gmult_4bit 1746 .rva .LSEH_info_gcm_gmult_4bit 1747 1748 .rva .LSEH_begin_gcm_ghash_4bit 1749 .rva .LSEH_end_gcm_ghash_4bit 1750 .rva .LSEH_info_gcm_ghash_4bit 1751 1752 .rva .LSEH_begin_gcm_init_clmul 1753 .rva .LSEH_end_gcm_init_clmul 1754 .rva .LSEH_info_gcm_init_clmul 1755 1756 .rva .LSEH_begin_gcm_ghash_clmul 1757 .rva .LSEH_end_gcm_ghash_clmul 1758 .rva .LSEH_info_gcm_ghash_clmul 1759___ 1760$code.=<<___ if ($avx); 1761 .rva .LSEH_begin_gcm_init_avx 1762 .rva .LSEH_end_gcm_init_avx 1763 .rva .LSEH_info_gcm_init_clmul 1764 1765 .rva .LSEH_begin_gcm_ghash_avx 1766 .rva .LSEH_end_gcm_ghash_avx 1767 .rva .LSEH_info_gcm_ghash_clmul 1768___ 1769$code.=<<___; 1770.section .xdata 1771.align 8 1772.LSEH_info_gcm_gmult_4bit: 1773 .byte 9,0,0,0 1774 .rva se_handler 1775 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData 1776.LSEH_info_gcm_ghash_4bit: 1777 .byte 9,0,0,0 1778 .rva se_handler 1779 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData 1780.LSEH_info_gcm_init_clmul: 1781 .byte 0x01,0x08,0x03,0x00 1782 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 1783 .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18 1784.LSEH_info_gcm_ghash_clmul: 1785 .byte 0x01,0x33,0x16,0x00 1786 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 1787 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 1788 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 1789 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 1790 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 1791 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 1792 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 1793 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 1794 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 1795 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 1796 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 1797___ 1798} 1799 1800$code =~ s/\`([^\`]*)\`/eval($1)/gem; 1801 1802print $code; 1803 1804close STDOUT; 1805