#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#       16-byte     64-byte     256-byte    1-KB        8-KB
# ECB   4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR   5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC   4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM   5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB   5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB   5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive. Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor         3x      6x      8x
# theoretical asymptotic limit          1.67    0.83    0.625
# measured performance for 8KB block    1.05    0.86    0.84
#
# "as if" interleave factor             4.7x    5.8x    6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt                           1.16    0.93    0.93
# CTR                                   1.14    0.91    n/a
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
# Script configuration: public symbol prefix, command-line arguments,
# hookup of the perlasm translator and the general-purpose register map
# shared by the code generators in this file.

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Usage: <script> [flavour] [output]. A lone argument containing a dot is
# taken to be the output file name rather than the flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is selected by flavour or by a .asm output file.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for x86_64-xlate.pl next to this script, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT is piped through the translator. The open
# is checked so that a failure to start it is reported instead of being
# silently ignored.
open OUT,"| \"$^X\" $xlate $flavour $output" or
	die "can't call $xlate: $!";
*STDOUT=*OUT;

# Instruction used to load round keys; it is movups for every $PREFIX.
$movkey = "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...
# Scratch registers holding copies of $rounds and $key, which the
# _aesni_[en|de]cryptN subroutines modify.
$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout: %xmm0-1 carry round keys, %xmm2-9 carry up to
# eight blocks of cipher/clear text.
($rndkey0,$rndkey1) = ("%xmm0","%xmm1");
($inout0,$inout1,$inout2,$inout3,
 $inout4,$inout5,$inout6,$inout7) = map("%xmm$_", 2..9);

# Aliases used in CBC decrypt, CTR, ... They overlap $inout4-$inout7.
($in2,$in1,$in0,$iv) = map("%xmm$_", 6..9);

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# Arguments: direction ("enc" or "dec"), key schedule pointer register,
# rounds register, optional data register (defaults to $inout0) and
# optional register that is xor-ed with round key 0 and then into the data.
# Every expansion gets its own loop label.
{ my $loop_id;	# expansion counter, keeps the loop labels unique
sub aesni_generate1 {
    my ($dir,$key,$rounds,$inout,$ivec) = @_;
    $inout = $inout0 unless defined($inout);
    ++$loop_id;

    $code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
    if (defined($ivec)) {
	$code.=<<___;
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
    } else {
	$code.=<<___;
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
    }
    $code.=<<___;
.Loop_${dir}1_${loop_id}:
	aes${dir}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${dir}1_${loop_id}	# loop body is 16 bytes
	aes${dir}last	$rndkey1,$inout
___
}}

# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points. Arguments are taken from the first three
# registers of the platform's calling convention.
{ my ($src,$dst,$sched) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
	_CET_ENDBR
	movups	($src),$inout0		# load input
	mov	240($sched),$rounds	# key->rounds
___
aesni_generate1("enc",$sched,$rounds);
$code.=<<___;
	movups	$inout0,($dst)		# output
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
	_CET_ENDBR
	movups	($src),$inout0		# load input
	mov	240($sched),$rounds	# key->rounds
___
aesni_generate1("dec",$sched,$rounds);
$code.=<<___;
	movups	$inout0,($dst)		# output
	ret
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x...
# Emit the private _aesni_[en|de]crypt3 subroutine, which processes three
# blocks held in $inout0-$inout2 with interleaved aes instructions.
# The argument is the direction, "enc" or "dec". The emitted code takes
# in $key and $rounds, which are *not* preserved.
sub aesni_generate3 {
    my ($op) = @_;
    my $asm = <<___;
.type	_aesni_${op}rypt3,\@abi-omnipotent
.align	16
_aesni_${op}rypt3:
	_CET_ENDBR
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	($key),$rndkey0

.L${op}_loop3:
	aes${op}	$rndkey1,$inout0
	aes${op}	$rndkey1,$inout1
	dec	$rounds
	aes${op}	$rndkey1,$inout2
	$movkey	16($key),$rndkey1
	aes${op}	$rndkey0,$inout0
	aes${op}	$rndkey0,$inout1
	lea	32($key),$key
	aes${op}	$rndkey0,$inout2
	$movkey	($key),$rndkey0
	jnz	.L${op}_loop3

	aes${op}	$rndkey1,$inout0
	aes${op}	$rndkey1,$inout1
	aes${op}	$rndkey1,$inout2
	aes${op}last	$rndkey0,$inout0
	aes${op}last	$rndkey0,$inout1
	aes${op}last	$rndkey0,$inout2
	ret
.size	_aesni_${op}rypt3,.-_aesni_${op}rypt3
___
    $code .= $asm;
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
# Emit the private _aesni_[en|de]crypt4 subroutine, which processes four
# blocks held in $inout0-$inout3. The argument is the direction, "enc" or
# "dec". The emitted code takes in $key and $rounds, which are *not*
# preserved.
sub aesni_generate4 {
    my ($op) = @_;
    my $asm = <<___;
.type	_aesni_${op}rypt4,\@abi-omnipotent
.align	16
_aesni_${op}rypt4:
	_CET_ENDBR
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	($key),$rndkey0

.L${op}_loop4:
	aes${op}	$rndkey1,$inout0
	aes${op}	$rndkey1,$inout1
	dec	$rounds
	aes${op}	$rndkey1,$inout2
	aes${op}	$rndkey1,$inout3
	$movkey	16($key),$rndkey1
	aes${op}	$rndkey0,$inout0
	aes${op}	$rndkey0,$inout1
	lea	32($key),$key
	aes${op}	$rndkey0,$inout2
	aes${op}	$rndkey0,$inout3
	$movkey	($key),$rndkey0
	jnz	.L${op}_loop4

	aes${op}	$rndkey1,$inout0
	aes${op}	$rndkey1,$inout1
	aes${op}	$rndkey1,$inout2
	aes${op}	$rndkey1,$inout3
	aes${op}last	$rndkey0,$inout0
	aes${op}last	$rndkey0,$inout1
	aes${op}last	$rndkey0,$inout2
	aes${op}last	$rndkey0,$inout3
	ret
.size	_aesni_${op}rypt4,.-_aesni_${op}rypt4
___
    $code .= $asm;
}

# Emit the private _aesni_[en|de]crypt6 subroutine, which processes six
# blocks held in $inout0-$inout5. The argument is the direction, "enc" or
# "dec". The emitted code takes in $key and $rounds, which are *not*
# preserved. The first round is interleaved with the round-0 key xor, and
# the loop is entered in the middle at .L*_loop6_enter.
sub aesni_generate6 {
    my ($op) = @_;
    my $asm = <<___;
.type	_aesni_${op}rypt6,\@abi-omnipotent
.align	16
_aesni_${op}rypt6:
	_CET_ENDBR
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	aes${op}	$rndkey1,$inout0
	pxor	$rndkey0,$inout2
	aes${op}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	aes${op}	$rndkey1,$inout2
	pxor	$rndkey0,$inout4
	aes${op}	$rndkey1,$inout3
	pxor	$rndkey0,$inout5
	dec	$rounds
	aes${op}	$rndkey1,$inout4
	$movkey	($key),$rndkey0
	aes${op}	$rndkey1,$inout5
	jmp	.L${op}_loop6_enter
.align	16
.L${op}_loop6:
	aes${op}	$rndkey1,$inout0
	aes${op}	$rndkey1,$inout1
	dec	$rounds
	aes${op}	$rndkey1,$inout2
	aes${op}	$rndkey1,$inout3
	aes${op}	$rndkey1,$inout4
	aes${op}	$rndkey1,$inout5
.L${op}_loop6_enter:				# happens to be 16-byte aligned
	$movkey	16($key),$rndkey1
	aes${op}	$rndkey0,$inout0
	aes${op}	$rndkey0,$inout1
	lea	32($key),$key
	aes${op}	$rndkey0,$inout2
	aes${op}	$rndkey0,$inout3
	aes${op}	$rndkey0,$inout4
	aes${op}	$rndkey0,$inout5
	$movkey	($key),$rndkey0
	jnz	.L${op}_loop6

	aes${op}	$rndkey1,$inout0
	aes${op}	$rndkey1,$inout1
	aes${op}	$rndkey1,$inout2
	aes${op}	$rndkey1,$inout3
	aes${op}	$rndkey1,$inout4
	aes${op}	$rndkey1,$inout5
	aes${op}last	$rndkey0,$inout0
	aes${op}last	$rndkey0,$inout1
	aes${op}last	$rndkey0,$inout2
	aes${op}last	$rndkey0,$inout3
	aes${op}last	$rndkey0,$inout4
	aes${op}last	$rndkey0,$inout5
	ret
.size	_aesni_${op}rypt6,.-_aesni_${op}rypt6
___
    $code .= $asm;
}

# Emit the private _aesni_[en|de]crypt8 subroutine, which processes eight
# blocks held in $inout0-$inout7. The argument is the direction, "enc" or
# "dec". The emitted code takes in $key and $rounds, which are *not*
# preserved. As in the 6x variant, the first round is interleaved with the
# round-0 key xor and the loop is entered at .L*_loop8_enter.
sub aesni_generate8 {
    my ($op) = @_;
    my $asm = <<___;
.type	_aesni_${op}rypt8,\@abi-omnipotent
.align	16
_aesni_${op}rypt8:
	_CET_ENDBR
	$movkey	($key),$rndkey0
	shr	\$1,$rounds
	$movkey	16($key),$rndkey1
	lea	32($key),$key
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	aes${op}	$rndkey1,$inout0
	pxor	$rndkey0,$inout2
	aes${op}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	aes${op}	$rndkey1,$inout2
	pxor	$rndkey0,$inout4
	aes${op}	$rndkey1,$inout3
	pxor	$rndkey0,$inout5
	dec	$rounds
	aes${op}	$rndkey1,$inout4
	pxor	$rndkey0,$inout6
	aes${op}	$rndkey1,$inout5
	pxor	$rndkey0,$inout7
	$movkey	($key),$rndkey0
	aes${op}	$rndkey1,$inout6
	aes${op}	$rndkey1,$inout7
	$movkey	16($key),$rndkey1
	jmp	.L${op}_loop8_enter
.align	16
.L${op}_loop8:
	aes${op}	$rndkey1,$inout0
	aes${op}	$rndkey1,$inout1
	dec	$rounds
	aes${op}	$rndkey1,$inout2
	aes${op}	$rndkey1,$inout3
	aes${op}	$rndkey1,$inout4
	aes${op}	$rndkey1,$inout5
	aes${op}	$rndkey1,$inout6
	aes${op}	$rndkey1,$inout7
	$movkey	16($key),$rndkey1
.L${op}_loop8_enter:				# happens to be 16-byte aligned
	aes${op}	$rndkey0,$inout0
	aes${op}	$rndkey0,$inout1
	lea	32($key),$key
	aes${op}	$rndkey0,$inout2
	aes${op}	$rndkey0,$inout3
	aes${op}	$rndkey0,$inout4
	aes${op}	$rndkey0,$inout5
	aes${op}	$rndkey0,$inout6
	aes${op}	$rndkey0,$inout7
	$movkey	($key),$rndkey0
	jnz	.L${op}_loop8

	aes${op}	$rndkey1,$inout0
	aes${op}	$rndkey1,$inout1
	aes${op}	$rndkey1,$inout2
	aes${op}	$rndkey1,$inout3
	aes${op}	$rndkey1,$inout4
	aes${op}	$rndkey1,$inout5
	aes${op}	$rndkey1,$inout6
	aes${op}	$rndkey1,$inout7
	aes${op}last	$rndkey0,$inout0
	aes${op}last	$rndkey0,$inout1
	aes${op}last	$rndkey0,$inout2
	aes${op}last	$rndkey0,$inout3
	aes${op}last	$rndkey0,$inout4
	aes${op}last	$rndkey0,$inout5
	aes${op}last	$rndkey0,$inout6
	aes${op}last	$rndkey0,$inout7
	ret
.size	_aesni_${op}rypt8,.-_aesni_${op}rypt8
___
    $code .= $asm;
}

# Instantiate the 3x, 4x and 6x subroutines. The decrypt flavour is always
# emitted; the encrypt flavour only when $PREFIX is "aesni".
for my $generate (\&aesni_generate3, \&aesni_generate4, \&aesni_generate6) {
    $generate->("enc") if ($PREFIX eq "aesni");
    $generate->("dec");
}
5215cdd308eSdjm&aesni_generate8("enc") if ($PREFIX eq "aesni"); 5225cdd308eSdjm&aesni_generate8("dec"); 5236249468aSthib 5246249468aSthibif ($PREFIX eq "aesni") { 5255cdd308eSdjm######################################################################## 5266249468aSthib# void aesni_ecb_encrypt (const void *in, void *out, 5276249468aSthib# size_t length, const AES_KEY *key, 5286249468aSthib# int enc); 5296249468aSthib$code.=<<___; 5306249468aSthib.globl aesni_ecb_encrypt 5316249468aSthib.type aesni_ecb_encrypt,\@function,5 5326249468aSthib.align 16 5336249468aSthibaesni_ecb_encrypt: 53422787c51Stb _CET_ENDBR 5356249468aSthib and \$-16,$len 5366249468aSthib jz .Lecb_ret 5376249468aSthib 5385cdd308eSdjm mov 240($key),$rounds # key->rounds 5395cdd308eSdjm $movkey ($key),$rndkey0 5405cdd308eSdjm mov $key,$key_ # backup $key 5415cdd308eSdjm mov $rounds,$rnds_ # backup $rounds 5425cdd308eSdjm test %r8d,%r8d # 5th argument 5435cdd308eSdjm jz .Lecb_decrypt 5445cdd308eSdjm#--------------------------- ECB ENCRYPT ------------------------------# 5455cdd308eSdjm cmp \$0x80,$len 5465cdd308eSdjm jb .Lecb_enc_tail 5475cdd308eSdjm 5485cdd308eSdjm movdqu ($inp),$inout0 5495cdd308eSdjm movdqu 0x10($inp),$inout1 5505cdd308eSdjm movdqu 0x20($inp),$inout2 5515cdd308eSdjm movdqu 0x30($inp),$inout3 5525cdd308eSdjm movdqu 0x40($inp),$inout4 5535cdd308eSdjm movdqu 0x50($inp),$inout5 5545cdd308eSdjm movdqu 0x60($inp),$inout6 5555cdd308eSdjm movdqu 0x70($inp),$inout7 5565cdd308eSdjm lea 0x80($inp),$inp 5575cdd308eSdjm sub \$0x80,$len 5585cdd308eSdjm jmp .Lecb_enc_loop8_enter 5595cdd308eSdjm.align 16 5605cdd308eSdjm.Lecb_enc_loop8: 5615cdd308eSdjm movups $inout0,($out) 5625cdd308eSdjm mov $key_,$key # restore $key 5635cdd308eSdjm movdqu ($inp),$inout0 5645cdd308eSdjm mov $rnds_,$rounds # restore $rounds 5655cdd308eSdjm movups $inout1,0x10($out) 5665cdd308eSdjm movdqu 0x10($inp),$inout1 5675cdd308eSdjm movups $inout2,0x20($out) 5685cdd308eSdjm movdqu 0x20($inp),$inout2 5695cdd308eSdjm movups 
$inout3,0x30($out) 5705cdd308eSdjm movdqu 0x30($inp),$inout3 5715cdd308eSdjm movups $inout4,0x40($out) 5725cdd308eSdjm movdqu 0x40($inp),$inout4 5735cdd308eSdjm movups $inout5,0x50($out) 5745cdd308eSdjm movdqu 0x50($inp),$inout5 5755cdd308eSdjm movups $inout6,0x60($out) 5765cdd308eSdjm movdqu 0x60($inp),$inout6 5775cdd308eSdjm movups $inout7,0x70($out) 5785cdd308eSdjm lea 0x80($out),$out 5795cdd308eSdjm movdqu 0x70($inp),$inout7 5805cdd308eSdjm lea 0x80($inp),$inp 5815cdd308eSdjm.Lecb_enc_loop8_enter: 5825cdd308eSdjm 5835cdd308eSdjm call _aesni_encrypt8 5845cdd308eSdjm 5855cdd308eSdjm sub \$0x80,$len 5865cdd308eSdjm jnc .Lecb_enc_loop8 5875cdd308eSdjm 5885cdd308eSdjm movups $inout0,($out) 5895cdd308eSdjm mov $key_,$key # restore $key 5905cdd308eSdjm movups $inout1,0x10($out) 5915cdd308eSdjm mov $rnds_,$rounds # restore $rounds 5925cdd308eSdjm movups $inout2,0x20($out) 5935cdd308eSdjm movups $inout3,0x30($out) 5945cdd308eSdjm movups $inout4,0x40($out) 5955cdd308eSdjm movups $inout5,0x50($out) 5965cdd308eSdjm movups $inout6,0x60($out) 5975cdd308eSdjm movups $inout7,0x70($out) 5985cdd308eSdjm lea 0x80($out),$out 5995cdd308eSdjm add \$0x80,$len 6005cdd308eSdjm jz .Lecb_ret 6015cdd308eSdjm 6025cdd308eSdjm.Lecb_enc_tail: 6036249468aSthib movups ($inp),$inout0 6046249468aSthib cmp \$0x20,$len 6055cdd308eSdjm jb .Lecb_enc_one 6066249468aSthib movups 0x10($inp),$inout1 6076249468aSthib je .Lecb_enc_two 6086249468aSthib movups 0x20($inp),$inout2 6095cdd308eSdjm cmp \$0x40,$len 6105cdd308eSdjm jb .Lecb_enc_three 6116249468aSthib movups 0x30($inp),$inout3 6125cdd308eSdjm je .Lecb_enc_four 6135cdd308eSdjm movups 0x40($inp),$inout4 6145cdd308eSdjm cmp \$0x60,$len 6155cdd308eSdjm jb .Lecb_enc_five 6165cdd308eSdjm movups 0x50($inp),$inout5 6175cdd308eSdjm je .Lecb_enc_six 6185cdd308eSdjm movdqu 0x60($inp),$inout6 6195cdd308eSdjm call _aesni_encrypt8 6206249468aSthib movups $inout0,($out) 6216249468aSthib movups $inout1,0x10($out) 6226249468aSthib movups $inout2,0x20($out) 
6236249468aSthib movups $inout3,0x30($out) 6245cdd308eSdjm movups $inout4,0x40($out) 6255cdd308eSdjm movups $inout5,0x50($out) 6265cdd308eSdjm movups $inout6,0x60($out) 6276249468aSthib jmp .Lecb_ret 6286249468aSthib.align 16 6296249468aSthib.Lecb_enc_one: 6306249468aSthib___ 6316249468aSthib &aesni_generate1("enc",$key,$rounds); 6326249468aSthib$code.=<<___; 6336249468aSthib movups $inout0,($out) 6346249468aSthib jmp .Lecb_ret 6356249468aSthib.align 16 6366249468aSthib.Lecb_enc_two: 6375cdd308eSdjm xorps $inout2,$inout2 6386249468aSthib call _aesni_encrypt3 6396249468aSthib movups $inout0,($out) 6406249468aSthib movups $inout1,0x10($out) 6416249468aSthib jmp .Lecb_ret 6426249468aSthib.align 16 6436249468aSthib.Lecb_enc_three: 6446249468aSthib call _aesni_encrypt3 6456249468aSthib movups $inout0,($out) 6466249468aSthib movups $inout1,0x10($out) 6476249468aSthib movups $inout2,0x20($out) 6486249468aSthib jmp .Lecb_ret 6496249468aSthib.align 16 6505cdd308eSdjm.Lecb_enc_four: 6515cdd308eSdjm call _aesni_encrypt4 6526249468aSthib movups $inout0,($out) 6536249468aSthib movups $inout1,0x10($out) 6546249468aSthib movups $inout2,0x20($out) 6556249468aSthib movups $inout3,0x30($out) 6566249468aSthib jmp .Lecb_ret 6576249468aSthib.align 16 6585cdd308eSdjm.Lecb_enc_five: 6595cdd308eSdjm xorps $inout5,$inout5 6605cdd308eSdjm call _aesni_encrypt6 6615cdd308eSdjm movups $inout0,($out) 6625cdd308eSdjm movups $inout1,0x10($out) 6635cdd308eSdjm movups $inout2,0x20($out) 6645cdd308eSdjm movups $inout3,0x30($out) 6655cdd308eSdjm movups $inout4,0x40($out) 6665cdd308eSdjm jmp .Lecb_ret 6675cdd308eSdjm.align 16 6685cdd308eSdjm.Lecb_enc_six: 6695cdd308eSdjm call _aesni_encrypt6 6705cdd308eSdjm movups $inout0,($out) 6715cdd308eSdjm movups $inout1,0x10($out) 6725cdd308eSdjm movups $inout2,0x20($out) 6735cdd308eSdjm movups $inout3,0x30($out) 6745cdd308eSdjm movups $inout4,0x40($out) 6755cdd308eSdjm movups $inout5,0x50($out) 6765cdd308eSdjm jmp .Lecb_ret 
6775cdd308eSdjm#--------------------------- ECB DECRYPT ------------------------------# 6785cdd308eSdjm.align 16 6795cdd308eSdjm.Lecb_decrypt: 6805cdd308eSdjm cmp \$0x80,$len 6815cdd308eSdjm jb .Lecb_dec_tail 6825cdd308eSdjm 6835cdd308eSdjm movdqu ($inp),$inout0 6845cdd308eSdjm movdqu 0x10($inp),$inout1 6855cdd308eSdjm movdqu 0x20($inp),$inout2 6865cdd308eSdjm movdqu 0x30($inp),$inout3 6875cdd308eSdjm movdqu 0x40($inp),$inout4 6885cdd308eSdjm movdqu 0x50($inp),$inout5 6895cdd308eSdjm movdqu 0x60($inp),$inout6 6905cdd308eSdjm movdqu 0x70($inp),$inout7 6915cdd308eSdjm lea 0x80($inp),$inp 6925cdd308eSdjm sub \$0x80,$len 6935cdd308eSdjm jmp .Lecb_dec_loop8_enter 6945cdd308eSdjm.align 16 6955cdd308eSdjm.Lecb_dec_loop8: 6965cdd308eSdjm movups $inout0,($out) 6975cdd308eSdjm mov $key_,$key # restore $key 6985cdd308eSdjm movdqu ($inp),$inout0 6995cdd308eSdjm mov $rnds_,$rounds # restore $rounds 7005cdd308eSdjm movups $inout1,0x10($out) 7015cdd308eSdjm movdqu 0x10($inp),$inout1 7025cdd308eSdjm movups $inout2,0x20($out) 7035cdd308eSdjm movdqu 0x20($inp),$inout2 7045cdd308eSdjm movups $inout3,0x30($out) 7055cdd308eSdjm movdqu 0x30($inp),$inout3 7065cdd308eSdjm movups $inout4,0x40($out) 7075cdd308eSdjm movdqu 0x40($inp),$inout4 7085cdd308eSdjm movups $inout5,0x50($out) 7095cdd308eSdjm movdqu 0x50($inp),$inout5 7105cdd308eSdjm movups $inout6,0x60($out) 7115cdd308eSdjm movdqu 0x60($inp),$inout6 7125cdd308eSdjm movups $inout7,0x70($out) 7135cdd308eSdjm lea 0x80($out),$out 7145cdd308eSdjm movdqu 0x70($inp),$inout7 7155cdd308eSdjm lea 0x80($inp),$inp 7165cdd308eSdjm.Lecb_dec_loop8_enter: 7175cdd308eSdjm 7185cdd308eSdjm call _aesni_decrypt8 7195cdd308eSdjm 7205cdd308eSdjm $movkey ($key_),$rndkey0 7215cdd308eSdjm sub \$0x80,$len 7225cdd308eSdjm jnc .Lecb_dec_loop8 7235cdd308eSdjm 7245cdd308eSdjm movups $inout0,($out) 7255cdd308eSdjm mov $key_,$key # restore $key 7265cdd308eSdjm movups $inout1,0x10($out) 7275cdd308eSdjm mov $rnds_,$rounds # restore $rounds 7285cdd308eSdjm movups 
$inout2,0x20($out) 7295cdd308eSdjm movups $inout3,0x30($out) 7305cdd308eSdjm movups $inout4,0x40($out) 7315cdd308eSdjm movups $inout5,0x50($out) 7325cdd308eSdjm movups $inout6,0x60($out) 7335cdd308eSdjm movups $inout7,0x70($out) 7345cdd308eSdjm lea 0x80($out),$out 7355cdd308eSdjm add \$0x80,$len 7365cdd308eSdjm jz .Lecb_ret 7375cdd308eSdjm 7385cdd308eSdjm.Lecb_dec_tail: 7395cdd308eSdjm movups ($inp),$inout0 7405cdd308eSdjm cmp \$0x20,$len 7415cdd308eSdjm jb .Lecb_dec_one 7425cdd308eSdjm movups 0x10($inp),$inout1 7435cdd308eSdjm je .Lecb_dec_two 7445cdd308eSdjm movups 0x20($inp),$inout2 7455cdd308eSdjm cmp \$0x40,$len 7465cdd308eSdjm jb .Lecb_dec_three 7475cdd308eSdjm movups 0x30($inp),$inout3 7485cdd308eSdjm je .Lecb_dec_four 7495cdd308eSdjm movups 0x40($inp),$inout4 7505cdd308eSdjm cmp \$0x60,$len 7515cdd308eSdjm jb .Lecb_dec_five 7525cdd308eSdjm movups 0x50($inp),$inout5 7535cdd308eSdjm je .Lecb_dec_six 7545cdd308eSdjm movups 0x60($inp),$inout6 7555cdd308eSdjm $movkey ($key),$rndkey0 7565cdd308eSdjm call _aesni_decrypt8 7575cdd308eSdjm movups $inout0,($out) 7585cdd308eSdjm movups $inout1,0x10($out) 7595cdd308eSdjm movups $inout2,0x20($out) 7605cdd308eSdjm movups $inout3,0x30($out) 7615cdd308eSdjm movups $inout4,0x40($out) 7625cdd308eSdjm movups $inout5,0x50($out) 7635cdd308eSdjm movups $inout6,0x60($out) 7645cdd308eSdjm jmp .Lecb_ret 7655cdd308eSdjm.align 16 7666249468aSthib.Lecb_dec_one: 7676249468aSthib___ 7686249468aSthib &aesni_generate1("dec",$key,$rounds); 7696249468aSthib$code.=<<___; 7706249468aSthib movups $inout0,($out) 7716249468aSthib jmp .Lecb_ret 7726249468aSthib.align 16 7736249468aSthib.Lecb_dec_two: 7745cdd308eSdjm xorps $inout2,$inout2 7756249468aSthib call _aesni_decrypt3 7766249468aSthib movups $inout0,($out) 7776249468aSthib movups $inout1,0x10($out) 7786249468aSthib jmp .Lecb_ret 7796249468aSthib.align 16 7806249468aSthib.Lecb_dec_three: 7816249468aSthib call _aesni_decrypt3 7826249468aSthib movups $inout0,($out) 7836249468aSthib movups 
$inout1,0x10($out) 7846249468aSthib movups $inout2,0x20($out) 7855cdd308eSdjm jmp .Lecb_ret 7865cdd308eSdjm.align 16 7875cdd308eSdjm.Lecb_dec_four: 7885cdd308eSdjm call _aesni_decrypt4 7895cdd308eSdjm movups $inout0,($out) 7905cdd308eSdjm movups $inout1,0x10($out) 7915cdd308eSdjm movups $inout2,0x20($out) 7925cdd308eSdjm movups $inout3,0x30($out) 7935cdd308eSdjm jmp .Lecb_ret 7945cdd308eSdjm.align 16 7955cdd308eSdjm.Lecb_dec_five: 7965cdd308eSdjm xorps $inout5,$inout5 7975cdd308eSdjm call _aesni_decrypt6 7985cdd308eSdjm movups $inout0,($out) 7995cdd308eSdjm movups $inout1,0x10($out) 8005cdd308eSdjm movups $inout2,0x20($out) 8015cdd308eSdjm movups $inout3,0x30($out) 8025cdd308eSdjm movups $inout4,0x40($out) 8035cdd308eSdjm jmp .Lecb_ret 8045cdd308eSdjm.align 16 8055cdd308eSdjm.Lecb_dec_six: 8065cdd308eSdjm call _aesni_decrypt6 8075cdd308eSdjm movups $inout0,($out) 8085cdd308eSdjm movups $inout1,0x10($out) 8095cdd308eSdjm movups $inout2,0x20($out) 8105cdd308eSdjm movups $inout3,0x30($out) 8115cdd308eSdjm movups $inout4,0x40($out) 8125cdd308eSdjm movups $inout5,0x50($out) 8136249468aSthib 8146249468aSthib.Lecb_ret: 8156249468aSthib ret 8166249468aSthib.size aesni_ecb_encrypt,.-aesni_ecb_encrypt 8176249468aSthib___ 8185cdd308eSdjm 8195cdd308eSdjm{ 8205cdd308eSdjm###################################################################### 8215cdd308eSdjm# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 8225cdd308eSdjm# size_t blocks, const AES_KEY *key, 8235cdd308eSdjm# const char *ivec,char *cmac); 8245cdd308eSdjm# 8255cdd308eSdjm# Handles only complete blocks, operates on a 64-bit counter and 8265cdd308eSdjm# does not update *ivec!
Nor does it finalize CMAC value 8275cdd308eSdjm# (see engine/eng_aesni.c for details) 8285cdd308eSdjm# The running CMAC block is loaded from *cmac on entry and stored back on exit. 8295cdd308eSdjm{ 8305cdd308eSdjmmy $cmac="%r9"; # 6th argument: pointer to the running CMAC block 8315cdd308eSdjm 8325cdd308eSdjmmy $increment="%xmm6"; # counter increment, loaded from .Lincrement64 8335cdd308eSdjmmy $bswap_mask="%xmm7"; # pshufb mask, loaded from .Lbswap_mask 8345cdd308eSdjm 8355cdd308eSdjm$code.=<<___; 8365cdd308eSdjm.globl aesni_ccm64_encrypt_blocks 8375cdd308eSdjm.type aesni_ccm64_encrypt_blocks,\@function,6 8385cdd308eSdjm.align 16 8395cdd308eSdjmaesni_ccm64_encrypt_blocks: 84022787c51Stb _CET_ENDBR 8415cdd308eSdjm___ 8425cdd308eSdjm$code.=<<___ if ($win64); 8435cdd308eSdjm lea -0x58(%rsp),%rsp 8445cdd308eSdjm movaps %xmm6,(%rsp) 8455cdd308eSdjm movaps %xmm7,0x10(%rsp) 8465cdd308eSdjm movaps %xmm8,0x20(%rsp) 8475cdd308eSdjm movaps %xmm9,0x30(%rsp) 8485cdd308eSdjm.Lccm64_enc_body: 8495cdd308eSdjm___ 8505cdd308eSdjm$code.=<<___; 8515cdd308eSdjm mov 240($key),$rounds # key->rounds 8525cdd308eSdjm movdqu ($ivp),$iv 8535cdd308eSdjm movdqa .Lincrement64(%rip),$increment 8545cdd308eSdjm movdqa .Lbswap_mask(%rip),$bswap_mask 8555cdd308eSdjm 8565cdd308eSdjm shr \$1,$rounds 8575cdd308eSdjm lea 0($key),$key_ 8585cdd308eSdjm movdqu ($cmac),$inout1 8595cdd308eSdjm movdqa $iv,$inout0 8605cdd308eSdjm mov $rounds,$rnds_ 8615cdd308eSdjm pshufb $bswap_mask,$iv 8625cdd308eSdjm jmp .Lccm64_enc_outer 8635cdd308eSdjm.align 16 8645cdd308eSdjm.Lccm64_enc_outer: 8655cdd308eSdjm $movkey ($key_),$rndkey0 8665cdd308eSdjm mov $rnds_,$rounds 8675cdd308eSdjm movups ($inp),$in0 # load inp 8685cdd308eSdjm 8695cdd308eSdjm xorps $rndkey0,$inout0 # counter 8705cdd308eSdjm $movkey 16($key_),$rndkey1 8715cdd308eSdjm xorps $in0,$rndkey0 8725cdd308eSdjm lea 32($key_),$key 8735cdd308eSdjm xorps $rndkey0,$inout1 # cmac^=inp 8745cdd308eSdjm $movkey ($key),$rndkey0 8755cdd308eSdjm 8765cdd308eSdjm.Lccm64_enc2_loop: 8775cdd308eSdjm aesenc $rndkey1,$inout0 8785cdd308eSdjm dec $rounds 8795cdd308eSdjm aesenc $rndkey1,$inout1 8805cdd308eSdjm $movkey 16($key),$rndkey1 8815cdd308eSdjm aesenc $rndkey0,$inout0
8825cdd308eSdjm lea 32($key),$key 8835cdd308eSdjm aesenc $rndkey0,$inout1 8845cdd308eSdjm $movkey 0($key),$rndkey0 8855cdd308eSdjm jnz .Lccm64_enc2_loop 8865cdd308eSdjm aesenc $rndkey1,$inout0 8875cdd308eSdjm aesenc $rndkey1,$inout1 8885cdd308eSdjm paddq $increment,$iv 8895cdd308eSdjm aesenclast $rndkey0,$inout0 8905cdd308eSdjm aesenclast $rndkey0,$inout1 8915cdd308eSdjm 8925cdd308eSdjm dec $len 8935cdd308eSdjm lea 16($inp),$inp 8945cdd308eSdjm xorps $inout0,$in0 # inp ^= E(iv) 8955cdd308eSdjm movdqa $iv,$inout0 8965cdd308eSdjm movups $in0,($out) # save output 8975cdd308eSdjm lea 16($out),$out 8985cdd308eSdjm pshufb $bswap_mask,$inout0 8995cdd308eSdjm jnz .Lccm64_enc_outer 9005cdd308eSdjm 9015cdd308eSdjm movups $inout1,($cmac) 9025cdd308eSdjm___ 9035cdd308eSdjm$code.=<<___ if ($win64); 9045cdd308eSdjm movaps (%rsp),%xmm6 9055cdd308eSdjm movaps 0x10(%rsp),%xmm7 9065cdd308eSdjm movaps 0x20(%rsp),%xmm8 9075cdd308eSdjm movaps 0x30(%rsp),%xmm9 9085cdd308eSdjm lea 0x58(%rsp),%rsp 9095cdd308eSdjm.Lccm64_enc_ret: 9105cdd308eSdjm___ 9115cdd308eSdjm$code.=<<___; 9125cdd308eSdjm ret 9135cdd308eSdjm.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks 9145cdd308eSdjm___ 9155cdd308eSdjm###################################################################### 9165cdd308eSdjm$code.=<<___; 9175cdd308eSdjm.globl aesni_ccm64_decrypt_blocks 9185cdd308eSdjm.type aesni_ccm64_decrypt_blocks,\@function,6 9195cdd308eSdjm.align 16 9205cdd308eSdjmaesni_ccm64_decrypt_blocks: 921*5caf18b2Stb _CET_ENDBR 9225cdd308eSdjm___ 9235cdd308eSdjm$code.=<<___ if ($win64); 9245cdd308eSdjm lea -0x58(%rsp),%rsp 9255cdd308eSdjm movaps %xmm6,(%rsp) 9265cdd308eSdjm movaps %xmm7,0x10(%rsp) 9275cdd308eSdjm movaps %xmm8,0x20(%rsp) 9285cdd308eSdjm movaps %xmm9,0x30(%rsp) 9295cdd308eSdjm.Lccm64_dec_body: 9305cdd308eSdjm___ 9315cdd308eSdjm$code.=<<___; 9325cdd308eSdjm mov 240($key),$rounds # key->rounds 9335cdd308eSdjm movups ($ivp),$iv 9345cdd308eSdjm movdqu ($cmac),$inout1 9355cdd308eSdjm movdqa
.Lincrement64(%rip),$increment 9365cdd308eSdjm movdqa .Lbswap_mask(%rip),$bswap_mask 9375cdd308eSdjm 9385cdd308eSdjm movaps $iv,$inout0 9395cdd308eSdjm mov $rounds,$rnds_ 9405cdd308eSdjm mov $key,$key_ 9415cdd308eSdjm pshufb $bswap_mask,$iv 9425cdd308eSdjm___ 9435cdd308eSdjm &aesni_generate1("enc",$key,$rounds); 9445cdd308eSdjm$code.=<<___; 9455cdd308eSdjm movups ($inp),$in0 # load inp 9465cdd308eSdjm paddq $increment,$iv 9475cdd308eSdjm lea 16($inp),$inp 9485cdd308eSdjm jmp .Lccm64_dec_outer 9495cdd308eSdjm.align 16 9505cdd308eSdjm.Lccm64_dec_outer: 9515cdd308eSdjm xorps $inout0,$in0 # inp ^= E(iv) 9525cdd308eSdjm movdqa $iv,$inout0 9535cdd308eSdjm mov $rnds_,$rounds 9545cdd308eSdjm movups $in0,($out) # save output 9555cdd308eSdjm lea 16($out),$out 9565cdd308eSdjm pshufb $bswap_mask,$inout0 9575cdd308eSdjm 9585cdd308eSdjm sub \$1,$len 9595cdd308eSdjm jz .Lccm64_dec_break 9605cdd308eSdjm 9615cdd308eSdjm $movkey ($key_),$rndkey0 9625cdd308eSdjm shr \$1,$rounds 9635cdd308eSdjm $movkey 16($key_),$rndkey1 9645cdd308eSdjm xorps $rndkey0,$in0 9655cdd308eSdjm lea 32($key_),$key 9665cdd308eSdjm xorps $rndkey0,$inout0 9675cdd308eSdjm xorps $in0,$inout1 # cmac^=out 9685cdd308eSdjm $movkey ($key),$rndkey0 9695cdd308eSdjm 9705cdd308eSdjm.Lccm64_dec2_loop: 9715cdd308eSdjm aesenc $rndkey1,$inout0 9725cdd308eSdjm dec $rounds 9735cdd308eSdjm aesenc $rndkey1,$inout1 9745cdd308eSdjm $movkey 16($key),$rndkey1 9755cdd308eSdjm aesenc $rndkey0,$inout0 9765cdd308eSdjm lea 32($key),$key 9775cdd308eSdjm aesenc $rndkey0,$inout1 9785cdd308eSdjm $movkey 0($key),$rndkey0 9795cdd308eSdjm jnz .Lccm64_dec2_loop 9805cdd308eSdjm movups ($inp),$in0 # load inp 9815cdd308eSdjm paddq $increment,$iv 9825cdd308eSdjm aesenc $rndkey1,$inout0 9835cdd308eSdjm aesenc $rndkey1,$inout1 9845cdd308eSdjm lea 16($inp),$inp 9855cdd308eSdjm aesenclast $rndkey0,$inout0 9865cdd308eSdjm aesenclast $rndkey0,$inout1 9875cdd308eSdjm jmp .Lccm64_dec_outer 9885cdd308eSdjm 9895cdd308eSdjm.align 16
9905cdd308eSdjm.Lccm64_dec_break: 9915cdd308eSdjm #xorps $in0,$inout1 # cmac^=out 9925cdd308eSdjm___ 9935cdd308eSdjm &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); 9945cdd308eSdjm$code.=<<___; 9955cdd308eSdjm movups $inout1,($cmac) 9965cdd308eSdjm___ 9975cdd308eSdjm$code.=<<___ if ($win64); 9985cdd308eSdjm movaps (%rsp),%xmm6 9995cdd308eSdjm movaps 0x10(%rsp),%xmm7 10005cdd308eSdjm movaps 0x20(%rsp),%xmm8 10015cdd308eSdjm movaps 0x30(%rsp),%xmm9 10025cdd308eSdjm lea 0x58(%rsp),%rsp 10035cdd308eSdjm.Lccm64_dec_ret: 10045cdd308eSdjm___ 10055cdd308eSdjm$code.=<<___; 10065cdd308eSdjm ret 10075cdd308eSdjm.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks 10085cdd308eSdjm___ 10095cdd308eSdjm} 10105cdd308eSdjm###################################################################### 10115cdd308eSdjm# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 10125cdd308eSdjm# size_t blocks, const AES_KEY *key, 10135cdd308eSdjm# const char *ivec); 10145cdd308eSdjm# 10155cdd308eSdjm# Handles only complete blocks, operates on a 32-bit counter and 10165cdd308eSdjm# does not update *ivec!
(see engine/eng_aesni.c for details) 10175cdd308eSdjm# Six blocks are processed per main-loop iteration; one to five leftover blocks go through the tail paths. 10185cdd308eSdjm{ 10195c104365Sjsingmy $frame_size = 0x20+($win64?160:0); # two 16-byte counter-vector slots, plus the %xmm6-%xmm15 save area on Win64 10205cdd308eSdjmmy ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11)); # input blocks 10215cdd308eSdjmmy ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14"); # two counter vectors and the counter-less ivec 10225cdd308eSdjmmy $bswap_mask="%xmm15"; # pshufb byte-swap mask, loaded from .Lbswap_mask 10235cdd308eSdjm 10245cdd308eSdjm$code.=<<___; 10255cdd308eSdjm.globl aesni_ctr32_encrypt_blocks 10265cdd308eSdjm.type aesni_ctr32_encrypt_blocks,\@function,5 10275cdd308eSdjm.align 16 10285cdd308eSdjmaesni_ctr32_encrypt_blocks: 102922787c51Stb _CET_ENDBR 10305c104365Sjsing lea (%rsp),%rax 10315c104365Sjsing push %rbp 10325c104365Sjsing sub \$$frame_size,%rsp 10335cdd308eSdjm___ 10345cdd308eSdjm$code.=<<___ if ($win64); 10355cdd308eSdjm movaps %xmm6,0x20(%rsp) 10365cdd308eSdjm movaps %xmm7,0x30(%rsp) 10375cdd308eSdjm movaps %xmm8,0x40(%rsp) 10385cdd308eSdjm movaps %xmm9,0x50(%rsp) 10395cdd308eSdjm movaps %xmm10,0x60(%rsp) 10405cdd308eSdjm movaps %xmm11,0x70(%rsp) 10415cdd308eSdjm movaps %xmm12,0x80(%rsp) 10425cdd308eSdjm movaps %xmm13,0x90(%rsp) 10435cdd308eSdjm movaps %xmm14,0xa0(%rsp) 10445cdd308eSdjm movaps %xmm15,0xb0(%rsp) 10455cdd308eSdjm.Lctr32_body: 10465cdd308eSdjm___ 10475cdd308eSdjm$code.=<<___; 10485c104365Sjsing lea -8(%rax),%rbp 10495cdd308eSdjm cmp \$1,$len 10505cdd308eSdjm je .Lctr32_one_shortcut 10515cdd308eSdjm 10525cdd308eSdjm movdqu ($ivp),$ivec 10535cdd308eSdjm movdqa .Lbswap_mask(%rip),$bswap_mask 10545cdd308eSdjm xor $rounds,$rounds 10555cdd308eSdjm pextrd \$3,$ivec,$rnds_ # pull 32-bit counter 10565cdd308eSdjm pinsrd \$3,$rounds,$ivec # wipe 32-bit counter 10575cdd308eSdjm 10585cdd308eSdjm mov 240($key),$rounds # key->rounds 10595cdd308eSdjm bswap $rnds_ 10605cdd308eSdjm pxor $iv0,$iv0 # vector of 3 32-bit counters 10615cdd308eSdjm pxor $iv1,$iv1 # vector of 3 32-bit counters 10625cdd308eSdjm pinsrd \$0,$rnds_,$iv0 10635cdd308eSdjm lea 3($rnds_),$key_ 10645cdd308eSdjm pinsrd \$0,$key_,$iv1 10655cdd308eSdjm inc $rnds_ 10665cdd308eSdjm pinsrd
\$1,$rnds_,$iv0 10675cdd308eSdjm inc $key_ 10685cdd308eSdjm pinsrd \$1,$key_,$iv1 10695cdd308eSdjm inc $rnds_ 10705cdd308eSdjm pinsrd \$2,$rnds_,$iv0 10715cdd308eSdjm inc $key_ 10725cdd308eSdjm pinsrd \$2,$key_,$iv1 10735c104365Sjsing movdqa $iv0,0x00(%rsp) 10745cdd308eSdjm pshufb $bswap_mask,$iv0 10755c104365Sjsing movdqa $iv1,0x10(%rsp) 10765cdd308eSdjm pshufb $bswap_mask,$iv1 10775cdd308eSdjm 10785cdd308eSdjm pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword 10795cdd308eSdjm pshufd \$`2<<6`,$iv0,$inout1 10805cdd308eSdjm pshufd \$`1<<6`,$iv0,$inout2 10815cdd308eSdjm cmp \$6,$len 10825cdd308eSdjm jb .Lctr32_tail 10835cdd308eSdjm shr \$1,$rounds 10845cdd308eSdjm mov $key,$key_ # backup $key 10855cdd308eSdjm mov $rounds,$rnds_ # backup $rounds 10865cdd308eSdjm sub \$6,$len 10875cdd308eSdjm jmp .Lctr32_loop6 10885cdd308eSdjm 10895cdd308eSdjm.align 16 10905cdd308eSdjm.Lctr32_loop6: 10915cdd308eSdjm pshufd \$`3<<6`,$iv1,$inout3 10925cdd308eSdjm por $ivec,$inout0 # merge counter-less ivec 10935cdd308eSdjm $movkey ($key_),$rndkey0 10945cdd308eSdjm pshufd \$`2<<6`,$iv1,$inout4 10955cdd308eSdjm por $ivec,$inout1 10965cdd308eSdjm $movkey 16($key_),$rndkey1 10975cdd308eSdjm pshufd \$`1<<6`,$iv1,$inout5 10985cdd308eSdjm por $ivec,$inout2 10995cdd308eSdjm por $ivec,$inout3 11005cdd308eSdjm xorps $rndkey0,$inout0 11015cdd308eSdjm por $ivec,$inout4 11025cdd308eSdjm por $ivec,$inout5 11035cdd308eSdjm 11045cdd308eSdjm # inline _aesni_encrypt6 and interleave last rounds 11055cdd308eSdjm # with own code...
11065cdd308eSdjm 11075cdd308eSdjm pxor $rndkey0,$inout1 11085cdd308eSdjm aesenc $rndkey1,$inout0 11095cdd308eSdjm lea 32($key_),$key 11105cdd308eSdjm pxor $rndkey0,$inout2 11115cdd308eSdjm aesenc $rndkey1,$inout1 11125cdd308eSdjm movdqa .Lincrement32(%rip),$iv1 11135cdd308eSdjm pxor $rndkey0,$inout3 11145cdd308eSdjm aesenc $rndkey1,$inout2 11155c104365Sjsing movdqa (%rsp),$iv0 11165cdd308eSdjm pxor $rndkey0,$inout4 11175cdd308eSdjm aesenc $rndkey1,$inout3 11185cdd308eSdjm pxor $rndkey0,$inout5 11195cdd308eSdjm $movkey ($key),$rndkey0 11205cdd308eSdjm dec $rounds 11215cdd308eSdjm aesenc $rndkey1,$inout4 11225cdd308eSdjm aesenc $rndkey1,$inout5 11235cdd308eSdjm jmp .Lctr32_enc_loop6_enter 11245cdd308eSdjm.align 16 11255cdd308eSdjm.Lctr32_enc_loop6: 11265cdd308eSdjm aesenc $rndkey1,$inout0 11275cdd308eSdjm aesenc $rndkey1,$inout1 11285cdd308eSdjm dec $rounds 11295cdd308eSdjm aesenc $rndkey1,$inout2 11305cdd308eSdjm aesenc $rndkey1,$inout3 11315cdd308eSdjm aesenc $rndkey1,$inout4 11325cdd308eSdjm aesenc $rndkey1,$inout5 11335cdd308eSdjm.Lctr32_enc_loop6_enter: 11345cdd308eSdjm $movkey 16($key),$rndkey1 11355cdd308eSdjm aesenc $rndkey0,$inout0 11365cdd308eSdjm aesenc $rndkey0,$inout1 11375cdd308eSdjm lea 32($key),$key 11385cdd308eSdjm aesenc $rndkey0,$inout2 11395cdd308eSdjm aesenc $rndkey0,$inout3 11405cdd308eSdjm aesenc $rndkey0,$inout4 11415cdd308eSdjm aesenc $rndkey0,$inout5 11425cdd308eSdjm $movkey ($key),$rndkey0 11435cdd308eSdjm jnz .Lctr32_enc_loop6 11445cdd308eSdjm 11455cdd308eSdjm aesenc $rndkey1,$inout0 11465cdd308eSdjm paddd $iv1,$iv0 # increment counter vector 11475cdd308eSdjm aesenc $rndkey1,$inout1 11485c104365Sjsing paddd 0x10(%rsp),$iv1 11495cdd308eSdjm aesenc $rndkey1,$inout2 11505c104365Sjsing movdqa $iv0,0x00(%rsp) # save counter vector 11515cdd308eSdjm aesenc $rndkey1,$inout3 11525c104365Sjsing movdqa $iv1,0x10(%rsp) 11535cdd308eSdjm aesenc $rndkey1,$inout4 11545cdd308eSdjm pshufb $bswap_mask,$iv0 # byte swap 11555cdd308eSdjm aesenc $rndkey1,$inout5
11565cdd308eSdjm pshufb $bswap_mask,$iv1 11575cdd308eSdjm 11585cdd308eSdjm aesenclast $rndkey0,$inout0 11595cdd308eSdjm movups ($inp),$in0 # load input 11605cdd308eSdjm aesenclast $rndkey0,$inout1 11615cdd308eSdjm movups 0x10($inp),$in1 11625cdd308eSdjm aesenclast $rndkey0,$inout2 11635cdd308eSdjm movups 0x20($inp),$in2 11645cdd308eSdjm aesenclast $rndkey0,$inout3 11655cdd308eSdjm movups 0x30($inp),$in3 11665cdd308eSdjm aesenclast $rndkey0,$inout4 11675cdd308eSdjm movups 0x40($inp),$rndkey1 11685cdd308eSdjm aesenclast $rndkey0,$inout5 11695cdd308eSdjm movups 0x50($inp),$rndkey0 11705cdd308eSdjm lea 0x60($inp),$inp 11715cdd308eSdjm 11725cdd308eSdjm xorps $inout0,$in0 # xor 11735cdd308eSdjm pshufd \$`3<<6`,$iv0,$inout0 11745cdd308eSdjm xorps $inout1,$in1 11755cdd308eSdjm pshufd \$`2<<6`,$iv0,$inout1 11765cdd308eSdjm movups $in0,($out) # store output 11775cdd308eSdjm xorps $inout2,$in2 11785cdd308eSdjm pshufd \$`1<<6`,$iv0,$inout2 11795cdd308eSdjm movups $in1,0x10($out) 11805cdd308eSdjm xorps $inout3,$in3 11815cdd308eSdjm movups $in2,0x20($out) 11825cdd308eSdjm xorps $inout4,$rndkey1 11835cdd308eSdjm movups $in3,0x30($out) 11845cdd308eSdjm xorps $inout5,$rndkey0 11855cdd308eSdjm movups $rndkey1,0x40($out) 11865cdd308eSdjm movups $rndkey0,0x50($out) 11875cdd308eSdjm lea 0x60($out),$out 11885cdd308eSdjm mov $rnds_,$rounds 11895cdd308eSdjm sub \$6,$len 11905cdd308eSdjm jnc .Lctr32_loop6 11915cdd308eSdjm 11925cdd308eSdjm add \$6,$len 11935cdd308eSdjm jz .Lctr32_done 11945cdd308eSdjm mov $key_,$key # restore $key 11955cdd308eSdjm lea 1($rounds,$rounds),$rounds # restore original value 11965cdd308eSdjm 11975cdd308eSdjm.Lctr32_tail: 11985cdd308eSdjm por $ivec,$inout0 11995cdd308eSdjm movups ($inp),$in0 12005cdd308eSdjm cmp \$2,$len 12015cdd308eSdjm jb .Lctr32_one 12025cdd308eSdjm 12035cdd308eSdjm por $ivec,$inout1 12045cdd308eSdjm movups 0x10($inp),$in1 12055cdd308eSdjm je .Lctr32_two 12065cdd308eSdjm 12075cdd308eSdjm pshufd \$`3<<6`,$iv1,$inout3 12085cdd308eSdjm por
$ivec,$inout2 12095cdd308eSdjm movups 0x20($inp),$in2 12105cdd308eSdjm cmp \$4,$len 12115cdd308eSdjm jb .Lctr32_three 12125cdd308eSdjm 12135cdd308eSdjm pshufd \$`2<<6`,$iv1,$inout4 12145cdd308eSdjm por $ivec,$inout3 12155cdd308eSdjm movups 0x30($inp),$in3 12165cdd308eSdjm je .Lctr32_four 12175cdd308eSdjm 12185cdd308eSdjm por $ivec,$inout4 12195cdd308eSdjm xorps $inout5,$inout5 12205cdd308eSdjm 12215cdd308eSdjm call _aesni_encrypt6 12225cdd308eSdjm 12235cdd308eSdjm movups 0x40($inp),$rndkey1 12245cdd308eSdjm xorps $inout0,$in0 12255cdd308eSdjm xorps $inout1,$in1 12265cdd308eSdjm movups $in0,($out) 12275cdd308eSdjm xorps $inout2,$in2 12285cdd308eSdjm movups $in1,0x10($out) 12295cdd308eSdjm xorps $inout3,$in3 12305cdd308eSdjm movups $in2,0x20($out) 12315cdd308eSdjm xorps $inout4,$rndkey1 12325cdd308eSdjm movups $in3,0x30($out) 12335cdd308eSdjm movups $rndkey1,0x40($out) 12345cdd308eSdjm jmp .Lctr32_done 12355cdd308eSdjm 12365cdd308eSdjm.align 16 12375cdd308eSdjm.Lctr32_one_shortcut: 12385cdd308eSdjm movups ($ivp),$inout0 12395cdd308eSdjm movups ($inp),$in0 12405cdd308eSdjm mov 240($key),$rounds # key->rounds 12415cdd308eSdjm.Lctr32_one: 12425cdd308eSdjm___ 12435cdd308eSdjm &aesni_generate1("enc",$key,$rounds); 12445cdd308eSdjm$code.=<<___; 12455cdd308eSdjm xorps $inout0,$in0 12465cdd308eSdjm movups $in0,($out) 12475cdd308eSdjm jmp .Lctr32_done 12485cdd308eSdjm 12495cdd308eSdjm.align 16 12505cdd308eSdjm.Lctr32_two: 12515cdd308eSdjm xorps $inout2,$inout2 12525cdd308eSdjm call _aesni_encrypt3 12535cdd308eSdjm xorps $inout0,$in0 12545cdd308eSdjm xorps $inout1,$in1 12555cdd308eSdjm movups $in0,($out) 12565cdd308eSdjm movups $in1,0x10($out) 12575cdd308eSdjm jmp .Lctr32_done 12585cdd308eSdjm 12595cdd308eSdjm.align 16 12605cdd308eSdjm.Lctr32_three: 12615cdd308eSdjm call _aesni_encrypt3 12625cdd308eSdjm xorps $inout0,$in0 12635cdd308eSdjm xorps $inout1,$in1 12645cdd308eSdjm movups $in0,($out) 12655cdd308eSdjm xorps $inout2,$in2 12665cdd308eSdjm movups $in1,0x10($out)
12675cdd308eSdjm movups $in2,0x20($out) 12685cdd308eSdjm jmp .Lctr32_done 12695cdd308eSdjm 12705cdd308eSdjm.align 16 12715cdd308eSdjm.Lctr32_four: 12725cdd308eSdjm call _aesni_encrypt4 12735cdd308eSdjm xorps $inout0,$in0 12745cdd308eSdjm xorps $inout1,$in1 12755cdd308eSdjm movups $in0,($out) 12765cdd308eSdjm xorps $inout2,$in2 12775cdd308eSdjm movups $in1,0x10($out) 12785cdd308eSdjm xorps $inout3,$in3 12795cdd308eSdjm movups $in2,0x20($out) 12805cdd308eSdjm movups $in3,0x30($out) 12815cdd308eSdjm 12825cdd308eSdjm.Lctr32_done: 12835cdd308eSdjm___ 12845cdd308eSdjm$code.=<<___ if ($win64); 12855cdd308eSdjm movaps 0x20(%rsp),%xmm6 12865cdd308eSdjm movaps 0x30(%rsp),%xmm7 12875cdd308eSdjm movaps 0x40(%rsp),%xmm8 12885cdd308eSdjm movaps 0x50(%rsp),%xmm9 12895cdd308eSdjm movaps 0x60(%rsp),%xmm10 12905cdd308eSdjm movaps 0x70(%rsp),%xmm11 12915cdd308eSdjm movaps 0x80(%rsp),%xmm12 12925cdd308eSdjm movaps 0x90(%rsp),%xmm13 12935cdd308eSdjm movaps 0xa0(%rsp),%xmm14 12945cdd308eSdjm movaps 0xb0(%rsp),%xmm15 12955cdd308eSdjm___ 12965cdd308eSdjm$code.=<<___; 12975c104365Sjsing lea (%rbp),%rsp 12985c104365Sjsing pop %rbp 12995c104365Sjsing.Lctr32_ret: 13005cdd308eSdjm ret 13015cdd308eSdjm.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks 13025cdd308eSdjm___ 13036249468aSthib} 13046249468aSthib 13055cdd308eSdjm###################################################################### 13065cdd308eSdjm# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 13075cdd308eSdjm# const AES_KEY *key1, const AES_KEY *key2, 13085cdd308eSdjm# const unsigned char iv[16]); 13095cdd308eSdjm# 13105cdd308eSdjm{ 13115cdd308eSdjmmy @tweak=map("%xmm$_",(10..15)); 13125cdd308eSdjmmy ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); 13135cdd308eSdjmmy ($key2,$ivp,$len_)=("%r8","%r9","%r9"); 13145c104365Sjsingmy $frame_size = 0x60 + ($win64?160:0); 13155cdd308eSdjm 13165cdd308eSdjm$code.=<<___; 13175cdd308eSdjm.globl aesni_xts_encrypt 13185cdd308eSdjm.type
aesni_xts_encrypt,\@function,6 13195cdd308eSdjm.align 16 13205cdd308eSdjmaesni_xts_encrypt: 1321*5caf18b2Stb _CET_ENDBR 13225c104365Sjsing lea (%rsp),%rax 13235c104365Sjsing push %rbp 13245c104365Sjsing sub \$$frame_size,%rsp 13255cdd308eSdjm___ 13265cdd308eSdjm$code.=<<___ if ($win64); 13275cdd308eSdjm movaps %xmm6,0x60(%rsp) 13285cdd308eSdjm movaps %xmm7,0x70(%rsp) 13295cdd308eSdjm movaps %xmm8,0x80(%rsp) 13305cdd308eSdjm movaps %xmm9,0x90(%rsp) 13315cdd308eSdjm movaps %xmm10,0xa0(%rsp) 13325cdd308eSdjm movaps %xmm11,0xb0(%rsp) 13335cdd308eSdjm movaps %xmm12,0xc0(%rsp) 13345cdd308eSdjm movaps %xmm13,0xd0(%rsp) 13355cdd308eSdjm movaps %xmm14,0xe0(%rsp) 13365cdd308eSdjm movaps %xmm15,0xf0(%rsp) 13375cdd308eSdjm.Lxts_enc_body: 13385cdd308eSdjm___ 13395cdd308eSdjm$code.=<<___; 13405c104365Sjsing lea -8(%rax),%rbp 13415cdd308eSdjm movups ($ivp),@tweak[5] # load clear-text tweak 13425cdd308eSdjm mov 240(%r8),$rounds # key2->rounds 13435cdd308eSdjm mov 240($key),$rnds_ # key1->rounds 13445cdd308eSdjm___ 13455cdd308eSdjm # generate the tweak 13465cdd308eSdjm &aesni_generate1("enc",$key2,$rounds,@tweak[5]); 13475cdd308eSdjm$code.=<<___; 13485cdd308eSdjm mov $key,$key_ # backup $key 13495cdd308eSdjm mov $rnds_,$rounds # backup $rounds 13505cdd308eSdjm mov $len,$len_ # backup $len 13515cdd308eSdjm and \$-16,$len 13525cdd308eSdjm 13535cdd308eSdjm movdqa .Lxts_magic(%rip),$twmask 13545cdd308eSdjm pxor $twtmp,$twtmp 13555cdd308eSdjm pcmpgtd @tweak[5],$twtmp # broadcast upper bits 13565cdd308eSdjm___ 13575cdd308eSdjm for ($i=0;$i<4;$i++) { 13585cdd308eSdjm $code.=<<___; 13595cdd308eSdjm pshufd \$0x13,$twtmp,$twres 13605cdd308eSdjm pxor $twtmp,$twtmp 13615cdd308eSdjm movdqa @tweak[5],@tweak[$i] 13625cdd308eSdjm paddq @tweak[5],@tweak[5] # psllq 1,$tweak 13635cdd308eSdjm pand $twmask,$twres # isolate carry and residue 136471743258Sjmc pcmpgtd @tweak[5],$twtmp # broadcast upper bits 13655cdd308eSdjm pxor $twres,@tweak[5] 13665cdd308eSdjm___ 13675cdd308eSdjm } 
13685cdd308eSdjm$code.=<<___; 13695cdd308eSdjm sub \$16*6,$len 13705cdd308eSdjm jc .Lxts_enc_short 13715cdd308eSdjm 13725cdd308eSdjm shr \$1,$rounds 13735cdd308eSdjm sub \$1,$rounds 13745cdd308eSdjm mov $rounds,$rnds_ 13755cdd308eSdjm jmp .Lxts_enc_grandloop 13765cdd308eSdjm 13775cdd308eSdjm.align 16 13785cdd308eSdjm.Lxts_enc_grandloop: 13795cdd308eSdjm pshufd \$0x13,$twtmp,$twres 13805cdd308eSdjm movdqa @tweak[5],@tweak[4] 13815cdd308eSdjm paddq @tweak[5],@tweak[5] # psllq 1,$tweak 13825cdd308eSdjm movdqu `16*0`($inp),$inout0 # load input 13835cdd308eSdjm pand $twmask,$twres # isolate carry and residue 13845cdd308eSdjm movdqu `16*1`($inp),$inout1 13855cdd308eSdjm pxor $twres,@tweak[5] 13865cdd308eSdjm 13875cdd308eSdjm movdqu `16*2`($inp),$inout2 13885cdd308eSdjm pxor @tweak[0],$inout0 # input^=tweak 13895cdd308eSdjm movdqu `16*3`($inp),$inout3 13905cdd308eSdjm pxor @tweak[1],$inout1 13915cdd308eSdjm movdqu `16*4`($inp),$inout4 13925cdd308eSdjm pxor @tweak[2],$inout2 13935cdd308eSdjm movdqu `16*5`($inp),$inout5 13945cdd308eSdjm lea `16*6`($inp),$inp 13955cdd308eSdjm pxor @tweak[3],$inout3 13965cdd308eSdjm $movkey ($key_),$rndkey0 13975cdd308eSdjm pxor @tweak[4],$inout4 13985cdd308eSdjm pxor @tweak[5],$inout5 13995cdd308eSdjm 14005cdd308eSdjm # inline _aesni_encrypt6 and interleave first and last rounds 14015cdd308eSdjm # with own code... 
14025cdd308eSdjm $movkey 16($key_),$rndkey1 14035cdd308eSdjm pxor $rndkey0,$inout0 14045cdd308eSdjm pxor $rndkey0,$inout1 14055cdd308eSdjm movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks 14065cdd308eSdjm aesenc $rndkey1,$inout0 14075cdd308eSdjm lea 32($key_),$key 14085cdd308eSdjm pxor $rndkey0,$inout2 14095cdd308eSdjm movdqa @tweak[1],`16*1`(%rsp) 14105cdd308eSdjm aesenc $rndkey1,$inout1 14115cdd308eSdjm pxor $rndkey0,$inout3 14125cdd308eSdjm movdqa @tweak[2],`16*2`(%rsp) 14135cdd308eSdjm aesenc $rndkey1,$inout2 14145cdd308eSdjm pxor $rndkey0,$inout4 14155cdd308eSdjm movdqa @tweak[3],`16*3`(%rsp) 14165cdd308eSdjm aesenc $rndkey1,$inout3 14175cdd308eSdjm pxor $rndkey0,$inout5 14185cdd308eSdjm $movkey ($key),$rndkey0 14195cdd308eSdjm dec $rounds 14205cdd308eSdjm movdqa @tweak[4],`16*4`(%rsp) 14215cdd308eSdjm aesenc $rndkey1,$inout4 14225cdd308eSdjm movdqa @tweak[5],`16*5`(%rsp) 14235cdd308eSdjm aesenc $rndkey1,$inout5 14245cdd308eSdjm pxor $twtmp,$twtmp 14255cdd308eSdjm pcmpgtd @tweak[5],$twtmp 14265cdd308eSdjm jmp .Lxts_enc_loop6_enter 14275cdd308eSdjm 14285cdd308eSdjm.align 16 14295cdd308eSdjm.Lxts_enc_loop6: 14305cdd308eSdjm aesenc $rndkey1,$inout0 14315cdd308eSdjm aesenc $rndkey1,$inout1 14325cdd308eSdjm dec $rounds 14335cdd308eSdjm aesenc $rndkey1,$inout2 14345cdd308eSdjm aesenc $rndkey1,$inout3 14355cdd308eSdjm aesenc $rndkey1,$inout4 14365cdd308eSdjm aesenc $rndkey1,$inout5 14375cdd308eSdjm.Lxts_enc_loop6_enter: 14385cdd308eSdjm $movkey 16($key),$rndkey1 14395cdd308eSdjm aesenc $rndkey0,$inout0 14405cdd308eSdjm aesenc $rndkey0,$inout1 14415cdd308eSdjm lea 32($key),$key 14425cdd308eSdjm aesenc $rndkey0,$inout2 14435cdd308eSdjm aesenc $rndkey0,$inout3 14445cdd308eSdjm aesenc $rndkey0,$inout4 14455cdd308eSdjm aesenc $rndkey0,$inout5 14465cdd308eSdjm $movkey ($key),$rndkey0 14475cdd308eSdjm jnz .Lxts_enc_loop6 14485cdd308eSdjm 14495cdd308eSdjm pshufd \$0x13,$twtmp,$twres 14505cdd308eSdjm pxor $twtmp,$twtmp 14515cdd308eSdjm paddq @tweak[5],@tweak[5] # psllq 
1,$tweak 14525cdd308eSdjm aesenc $rndkey1,$inout0 14535cdd308eSdjm pand $twmask,$twres # isolate carry and residue 14545cdd308eSdjm aesenc $rndkey1,$inout1 14555cdd308eSdjm pcmpgtd @tweak[5],$twtmp # broadcast upper bits 14565cdd308eSdjm aesenc $rndkey1,$inout2 14575cdd308eSdjm pxor $twres,@tweak[5] 14585cdd308eSdjm aesenc $rndkey1,$inout3 14595cdd308eSdjm aesenc $rndkey1,$inout4 14605cdd308eSdjm aesenc $rndkey1,$inout5 14615cdd308eSdjm $movkey 16($key),$rndkey1 14625cdd308eSdjm 14635cdd308eSdjm pshufd \$0x13,$twtmp,$twres 14645cdd308eSdjm pxor $twtmp,$twtmp 14655cdd308eSdjm movdqa @tweak[5],@tweak[0] 14665cdd308eSdjm paddq @tweak[5],@tweak[5] # psllq 1,$tweak 14675cdd308eSdjm aesenc $rndkey0,$inout0 14685cdd308eSdjm pand $twmask,$twres # isolate carry and residue 14695cdd308eSdjm aesenc $rndkey0,$inout1 147071743258Sjmc pcmpgtd @tweak[5],$twtmp # broadcast upper bits 14715cdd308eSdjm aesenc $rndkey0,$inout2 14725cdd308eSdjm pxor $twres,@tweak[5] 14735cdd308eSdjm aesenc $rndkey0,$inout3 14745cdd308eSdjm aesenc $rndkey0,$inout4 14755cdd308eSdjm aesenc $rndkey0,$inout5 14765cdd308eSdjm $movkey 32($key),$rndkey0 14775cdd308eSdjm 14785cdd308eSdjm pshufd \$0x13,$twtmp,$twres 14795cdd308eSdjm pxor $twtmp,$twtmp 14805cdd308eSdjm movdqa @tweak[5],@tweak[1] 14815cdd308eSdjm paddq @tweak[5],@tweak[5] # psllq 1,$tweak 14825cdd308eSdjm aesenc $rndkey1,$inout0 14835cdd308eSdjm pand $twmask,$twres # isolate carry and residue 14845cdd308eSdjm aesenc $rndkey1,$inout1 148571743258Sjmc pcmpgtd @tweak[5],$twtmp # broadcast upper bits 14865cdd308eSdjm aesenc $rndkey1,$inout2 14875cdd308eSdjm pxor $twres,@tweak[5] 14885cdd308eSdjm aesenc $rndkey1,$inout3 14895cdd308eSdjm aesenc $rndkey1,$inout4 14905cdd308eSdjm aesenc $rndkey1,$inout5 14915cdd308eSdjm 14925cdd308eSdjm pshufd \$0x13,$twtmp,$twres 14935cdd308eSdjm pxor $twtmp,$twtmp 14945cdd308eSdjm movdqa @tweak[5],@tweak[2] 14955cdd308eSdjm paddq @tweak[5],@tweak[5] # psllq 1,$tweak 14965cdd308eSdjm aesenclast $rndkey0,$inout0 
14975cdd308eSdjm pand $twmask,$twres # isolate carry and residue 14985cdd308eSdjm aesenclast $rndkey0,$inout1 149971743258Sjmc pcmpgtd @tweak[5],$twtmp # broadcast upper bits 15005cdd308eSdjm aesenclast $rndkey0,$inout2 15015cdd308eSdjm pxor $twres,@tweak[5] 15025cdd308eSdjm aesenclast $rndkey0,$inout3 15035cdd308eSdjm aesenclast $rndkey0,$inout4 15045cdd308eSdjm aesenclast $rndkey0,$inout5 15055cdd308eSdjm 15065cdd308eSdjm pshufd \$0x13,$twtmp,$twres 15075cdd308eSdjm pxor $twtmp,$twtmp 15085cdd308eSdjm movdqa @tweak[5],@tweak[3] 15095cdd308eSdjm paddq @tweak[5],@tweak[5] # psllq 1,$tweak 15105cdd308eSdjm xorps `16*0`(%rsp),$inout0 # output^=tweak 15115cdd308eSdjm pand $twmask,$twres # isolate carry and residue 15125cdd308eSdjm xorps `16*1`(%rsp),$inout1 151371743258Sjmc pcmpgtd @tweak[5],$twtmp # broadcast upper bits 15145cdd308eSdjm pxor $twres,@tweak[5] 15155cdd308eSdjm 15165cdd308eSdjm xorps `16*2`(%rsp),$inout2 15175cdd308eSdjm movups $inout0,`16*0`($out) # write output 15185cdd308eSdjm xorps `16*3`(%rsp),$inout3 15195cdd308eSdjm movups $inout1,`16*1`($out) 15205cdd308eSdjm xorps `16*4`(%rsp),$inout4 15215cdd308eSdjm movups $inout2,`16*2`($out) 15225cdd308eSdjm xorps `16*5`(%rsp),$inout5 15235cdd308eSdjm movups $inout3,`16*3`($out) 15245cdd308eSdjm mov $rnds_,$rounds # restore $rounds 15255cdd308eSdjm movups $inout4,`16*4`($out) 15265cdd308eSdjm movups $inout5,`16*5`($out) 15275cdd308eSdjm lea `16*6`($out),$out 15285cdd308eSdjm sub \$16*6,$len 15295cdd308eSdjm jnc .Lxts_enc_grandloop 15305cdd308eSdjm 15315cdd308eSdjm lea 3($rounds,$rounds),$rounds # restore original value 15325cdd308eSdjm mov $key_,$key # restore $key 15335cdd308eSdjm mov $rounds,$rnds_ # backup $rounds 15345cdd308eSdjm 15355cdd308eSdjm.Lxts_enc_short: 15365cdd308eSdjm add \$16*6,$len 15375cdd308eSdjm jz .Lxts_enc_done 15385cdd308eSdjm 15395cdd308eSdjm cmp \$0x20,$len 15405cdd308eSdjm jb .Lxts_enc_one 15415cdd308eSdjm je .Lxts_enc_two 15425cdd308eSdjm 15435cdd308eSdjm cmp \$0x40,$len 
15445cdd308eSdjm jb .Lxts_enc_three 15455cdd308eSdjm je .Lxts_enc_four 15465cdd308eSdjm 15475cdd308eSdjm pshufd \$0x13,$twtmp,$twres 15485cdd308eSdjm movdqa @tweak[5],@tweak[4] 15495cdd308eSdjm paddq @tweak[5],@tweak[5] # psllq 1,$tweak 15505cdd308eSdjm movdqu ($inp),$inout0 15515cdd308eSdjm pand $twmask,$twres # isolate carry and residue 15525cdd308eSdjm movdqu 16*1($inp),$inout1 15535cdd308eSdjm pxor $twres,@tweak[5] 15545cdd308eSdjm 15555cdd308eSdjm movdqu 16*2($inp),$inout2 15565cdd308eSdjm pxor @tweak[0],$inout0 15575cdd308eSdjm movdqu 16*3($inp),$inout3 15585cdd308eSdjm pxor @tweak[1],$inout1 15595cdd308eSdjm movdqu 16*4($inp),$inout4 15605cdd308eSdjm lea 16*5($inp),$inp 15615cdd308eSdjm pxor @tweak[2],$inout2 15625cdd308eSdjm pxor @tweak[3],$inout3 15635cdd308eSdjm pxor @tweak[4],$inout4 15645cdd308eSdjm 15655cdd308eSdjm call _aesni_encrypt6 15665cdd308eSdjm 15675cdd308eSdjm xorps @tweak[0],$inout0 15685cdd308eSdjm movdqa @tweak[5],@tweak[0] 15695cdd308eSdjm xorps @tweak[1],$inout1 15705cdd308eSdjm xorps @tweak[2],$inout2 15715cdd308eSdjm movdqu $inout0,($out) 15725cdd308eSdjm xorps @tweak[3],$inout3 15735cdd308eSdjm movdqu $inout1,16*1($out) 15745cdd308eSdjm xorps @tweak[4],$inout4 15755cdd308eSdjm movdqu $inout2,16*2($out) 15765cdd308eSdjm movdqu $inout3,16*3($out) 15775cdd308eSdjm movdqu $inout4,16*4($out) 15785cdd308eSdjm lea 16*5($out),$out 15795cdd308eSdjm jmp .Lxts_enc_done 15805cdd308eSdjm 15815cdd308eSdjm.align 16 15825cdd308eSdjm.Lxts_enc_one: 15835cdd308eSdjm movups ($inp),$inout0 15845cdd308eSdjm lea 16*1($inp),$inp 15855cdd308eSdjm xorps @tweak[0],$inout0 15865cdd308eSdjm___ 15875cdd308eSdjm &aesni_generate1("enc",$key,$rounds); 15885cdd308eSdjm$code.=<<___; 15895cdd308eSdjm xorps @tweak[0],$inout0 15905cdd308eSdjm movdqa @tweak[1],@tweak[0] 15915cdd308eSdjm movups $inout0,($out) 15925cdd308eSdjm lea 16*1($out),$out 15935cdd308eSdjm jmp .Lxts_enc_done 15945cdd308eSdjm 15955cdd308eSdjm.align 16 15965cdd308eSdjm.Lxts_enc_two: 15975cdd308eSdjm 
movups ($inp),$inout0 15985cdd308eSdjm movups 16($inp),$inout1 15995cdd308eSdjm lea 32($inp),$inp 16005cdd308eSdjm xorps @tweak[0],$inout0 16015cdd308eSdjm xorps @tweak[1],$inout1 16025cdd308eSdjm 16035cdd308eSdjm call _aesni_encrypt3 16045cdd308eSdjm 16055cdd308eSdjm xorps @tweak[0],$inout0 16065cdd308eSdjm movdqa @tweak[2],@tweak[0] 16075cdd308eSdjm xorps @tweak[1],$inout1 16085cdd308eSdjm movups $inout0,($out) 16095cdd308eSdjm movups $inout1,16*1($out) 16105cdd308eSdjm lea 16*2($out),$out 16115cdd308eSdjm jmp .Lxts_enc_done 16125cdd308eSdjm 16135cdd308eSdjm.align 16 16145cdd308eSdjm.Lxts_enc_three: 16155cdd308eSdjm movups ($inp),$inout0 16165cdd308eSdjm movups 16*1($inp),$inout1 16175cdd308eSdjm movups 16*2($inp),$inout2 16185cdd308eSdjm lea 16*3($inp),$inp 16195cdd308eSdjm xorps @tweak[0],$inout0 16205cdd308eSdjm xorps @tweak[1],$inout1 16215cdd308eSdjm xorps @tweak[2],$inout2 16225cdd308eSdjm 16235cdd308eSdjm call _aesni_encrypt3 16245cdd308eSdjm 16255cdd308eSdjm xorps @tweak[0],$inout0 16265cdd308eSdjm movdqa @tweak[3],@tweak[0] 16275cdd308eSdjm xorps @tweak[1],$inout1 16285cdd308eSdjm xorps @tweak[2],$inout2 16295cdd308eSdjm movups $inout0,($out) 16305cdd308eSdjm movups $inout1,16*1($out) 16315cdd308eSdjm movups $inout2,16*2($out) 16325cdd308eSdjm lea 16*3($out),$out 16335cdd308eSdjm jmp .Lxts_enc_done 16345cdd308eSdjm 16355cdd308eSdjm.align 16 16365cdd308eSdjm.Lxts_enc_four: 16375cdd308eSdjm movups ($inp),$inout0 16385cdd308eSdjm movups 16*1($inp),$inout1 16395cdd308eSdjm movups 16*2($inp),$inout2 16405cdd308eSdjm xorps @tweak[0],$inout0 16415cdd308eSdjm movups 16*3($inp),$inout3 16425cdd308eSdjm lea 16*4($inp),$inp 16435cdd308eSdjm xorps @tweak[1],$inout1 16445cdd308eSdjm xorps @tweak[2],$inout2 16455cdd308eSdjm xorps @tweak[3],$inout3 16465cdd308eSdjm 16475cdd308eSdjm call _aesni_encrypt4 16485cdd308eSdjm 16495cdd308eSdjm xorps @tweak[0],$inout0 16505cdd308eSdjm movdqa @tweak[5],@tweak[0] 16515cdd308eSdjm xorps @tweak[1],$inout1 16525cdd308eSdjm xorps 
@tweak[2],$inout2 16535cdd308eSdjm movups $inout0,($out) 16545cdd308eSdjm xorps @tweak[3],$inout3 16555cdd308eSdjm movups $inout1,16*1($out) 16565cdd308eSdjm movups $inout2,16*2($out) 16575cdd308eSdjm movups $inout3,16*3($out) 16585cdd308eSdjm lea 16*4($out),$out 16595cdd308eSdjm jmp .Lxts_enc_done 16605cdd308eSdjm 16615cdd308eSdjm.align 16 16625cdd308eSdjm.Lxts_enc_done: 16635cdd308eSdjm and \$15,$len_ 16645cdd308eSdjm jz .Lxts_enc_ret 16655cdd308eSdjm mov $len_,$len 16665cdd308eSdjm 16675cdd308eSdjm.Lxts_enc_steal: 16685cdd308eSdjm movzb ($inp),%eax # borrow $rounds ... 16695cdd308eSdjm movzb -16($out),%ecx # ... and $key 16705cdd308eSdjm lea 1($inp),$inp 16715cdd308eSdjm mov %al,-16($out) 16725cdd308eSdjm mov %cl,0($out) 16735cdd308eSdjm lea 1($out),$out 16745cdd308eSdjm sub \$1,$len 16755cdd308eSdjm jnz .Lxts_enc_steal 16765cdd308eSdjm 16775cdd308eSdjm sub $len_,$out # rewind $out 16785cdd308eSdjm mov $key_,$key # restore $key 16795cdd308eSdjm mov $rnds_,$rounds # restore $rounds 16805cdd308eSdjm 16815cdd308eSdjm movups -16($out),$inout0 16825cdd308eSdjm xorps @tweak[0],$inout0 16835cdd308eSdjm___ 16845cdd308eSdjm &aesni_generate1("enc",$key,$rounds); 16855cdd308eSdjm$code.=<<___; 16865cdd308eSdjm xorps @tweak[0],$inout0 16875cdd308eSdjm movups $inout0,-16($out) 16885cdd308eSdjm 16895cdd308eSdjm.Lxts_enc_ret: 16905cdd308eSdjm___ 16915cdd308eSdjm$code.=<<___ if ($win64); 16925cdd308eSdjm movaps 0x60(%rsp),%xmm6 16935cdd308eSdjm movaps 0x70(%rsp),%xmm7 16945cdd308eSdjm movaps 0x80(%rsp),%xmm8 16955cdd308eSdjm movaps 0x90(%rsp),%xmm9 16965cdd308eSdjm movaps 0xa0(%rsp),%xmm10 16975cdd308eSdjm movaps 0xb0(%rsp),%xmm11 16985cdd308eSdjm movaps 0xc0(%rsp),%xmm12 16995cdd308eSdjm movaps 0xd0(%rsp),%xmm13 17005cdd308eSdjm movaps 0xe0(%rsp),%xmm14 17015cdd308eSdjm movaps 0xf0(%rsp),%xmm15 17025cdd308eSdjm___ 17035cdd308eSdjm$code.=<<___; 17045c104365Sjsing lea (%rbp),%rsp 17055c104365Sjsing pop %rbp 17065cdd308eSdjm.Lxts_enc_epilogue: 17075cdd308eSdjm ret 
.size	aesni_xts_encrypt,.-aesni_xts_encrypt
___

# aesni_xts_decrypt -- AES-XTS decryption.
#
# Declared to the translator as a 6-argument function.  As used below the
# operands are $inp (input), $out (output), $len (byte count), $key (key1,
# used to decrypt data), $key2 (key2, used only to encrypt the tweak) and
# $ivp (16-byte clear-text tweak).
#
# Outline of the generated code:
#  1. Prologue: %rbp is pushed and $frame_size bytes are reserved; %rbp is
#     then pointed at the saved slot so the epilogue can restore %rsp.
#  2. The tweak loaded from ($ivp) is encrypted with key2.
#  3. If $len is not a multiple of 16, one whole block is held back
#     (len -= 16) for ciphertext stealing; $len_ keeps that byte count and
#     $len is rounded down to whole blocks.
#  4. Four tweaks are precomputed; the grand loop then handles six blocks
#     per iteration with the decrypt rounds inlined and interleaved with
#     the tweak updates for the next iteration.  The six tweaks in use are
#     parked at 16*0..16*5(%rsp) while that happens.
#  5. .Lxts_dec_short dispatches on the 0..5 whole blocks that remain.
#  6. .Lxts_dec_done/.Lxts_dec_done2 perform ciphertext stealing when
#     ($len_ & 15) is non-zero, then the epilogue restores the frame.
$code.=<<___;
.globl	aesni_xts_decrypt
.type	aesni_xts_decrypt,\@function,6
.align	16
aesni_xts_decrypt:
	_CET_ENDBR
	lea	(%rsp),%rax
	push	%rbp
	sub	\$$frame_size,%rsp
___
# Win64 only: %xmm6-%xmm15 are stored in the frame at 0x60..0xf0(%rsp)
# and reloaded from the same slots at .Lxts_dec_ret.
$code.=<<___ if ($win64);
	movaps	%xmm6,0x60(%rsp)
	movaps	%xmm7,0x70(%rsp)
	movaps	%xmm8,0x80(%rsp)
	movaps	%xmm9,0x90(%rsp)
	movaps	%xmm10,0xa0(%rsp)
	movaps	%xmm11,0xb0(%rsp)
	movaps	%xmm12,0xc0(%rsp)
	movaps	%xmm13,0xd0(%rsp)
	movaps	%xmm14,0xe0(%rsp)
	movaps	%xmm15,0xf0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	lea	-8(%rax),%rbp
	movups	($ivp),@tweak[5]		# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
$code.=<<___;
	xor	%eax,%eax			# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len

	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# switch to key1->rounds
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	movdqa	.Lxts_magic(%rip),$twmask
	pxor	$twtmp,$twtmp
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
___
    # Emit four copies of the tweak-update sequence: copy i saves the
    # running tweak into @tweak[i] and then advances @tweak[5] (doubling
    # via paddq plus the masked carry folded back in with pxor).
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[$i]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	pand	$twmask,$twres			# isolate carry and residue
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	pxor	$twres,@tweak[5]
___
    }
# Main body.  $rounds is converted to a loop count for the two-rounds-per-
# iteration .Lxts_dec_loop6 (shr 1, sub 1) and converted back with
# "lea 3(rounds,rounds)" once fewer than six blocks remain.
$code.=<<___;
	sub	\$16*6,$len
	jc	.Lxts_dec_short

	shr	\$1,$rounds
	sub	\$1,$rounds
	mov	$rounds,$rnds_
	jmp	.Lxts_dec_grandloop

.align	16
.Lxts_dec_grandloop:
	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	movdqu	`16*0`($inp),$inout0		# load input
	pand	$twmask,$twres			# isolate carry and residue
	movdqu	`16*1`($inp),$inout1
	pxor	$twres,@tweak[5]

	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[0],$inout0		# input^=tweak
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[2],$inout2
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp
	pxor	@tweak[3],$inout3
	$movkey		($key_),$rndkey0
	pxor	@tweak[4],$inout4
	pxor	@tweak[5],$inout5

	# inline _aesni_decrypt6 and interleave first and last rounds
	# with own code...
	$movkey		16($key_),$rndkey1
	pxor		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
	aesdec		$rndkey1,$inout0
	lea		32($key_),$key
	pxor		$rndkey0,$inout2
	 movdqa	@tweak[1],`16*1`(%rsp)
	aesdec		$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	 movdqa	@tweak[2],`16*2`(%rsp)
	aesdec		$rndkey1,$inout2
	pxor		$rndkey0,$inout4
	 movdqa	@tweak[3],`16*3`(%rsp)
	aesdec		$rndkey1,$inout3
	pxor		$rndkey0,$inout5
	$movkey		($key),$rndkey0
	dec		$rounds
	 movdqa	@tweak[4],`16*4`(%rsp)
	aesdec		$rndkey1,$inout4
	 movdqa	@tweak[5],`16*5`(%rsp)
	aesdec		$rndkey1,$inout5
	pxor		$twtmp,$twtmp
	pcmpgtd		@tweak[5],$twtmp
	jmp		.Lxts_dec_loop6_enter

.align	16
.Lxts_dec_loop6:
	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	dec		$rounds
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	aesdec		$rndkey1,$inout4
	aesdec		$rndkey1,$inout5
.Lxts_dec_loop6_enter:
	$movkey		16($key),$rndkey1
	aesdec		$rndkey0,$inout0
	aesdec		$rndkey0,$inout1
	lea		32($key),$key
	aesdec		$rndkey0,$inout2
	aesdec		$rndkey0,$inout3
	aesdec		$rndkey0,$inout4
	aesdec		$rndkey0,$inout5
	$movkey		($key),$rndkey0
	jnz		.Lxts_dec_loop6

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesdec		$rndkey1,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesdec		$rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesdec		$rndkey1,$inout2
	pxor	$twres,@tweak[5]
	 aesdec		$rndkey1,$inout3
	 aesdec		$rndkey1,$inout4
	 aesdec		$rndkey1,$inout5
	 $movkey	16($key),$rndkey1

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesdec		$rndkey0,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesdec		$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesdec		$rndkey0,$inout2
	pxor	$twres,@tweak[5]
	 aesdec		$rndkey0,$inout3
	 aesdec		$rndkey0,$inout4
	 aesdec		$rndkey0,$inout5
	 $movkey	32($key),$rndkey0

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[1]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesdec		$rndkey1,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesdec		$rndkey1,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesdec		$rndkey1,$inout2
	pxor	$twres,@tweak[5]
	 aesdec		$rndkey1,$inout3
	 aesdec		$rndkey1,$inout4
	 aesdec		$rndkey1,$inout5

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[2]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 aesdeclast	$rndkey0,$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 aesdeclast	$rndkey0,$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	 aesdeclast	$rndkey0,$inout2
	pxor	$twres,@tweak[5]
	 aesdeclast	$rndkey0,$inout3
	 aesdeclast	$rndkey0,$inout4
	 aesdeclast	$rndkey0,$inout5

	pshufd	\$0x13,$twtmp,$twres
	pxor	$twtmp,$twtmp
	movdqa	@tweak[5],@tweak[3]
	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
	pand	$twmask,$twres			# isolate carry and residue
	 xorps	`16*1`(%rsp),$inout1
	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
	pxor	$twres,@tweak[5]

	xorps	`16*2`(%rsp),$inout2
	movups	$inout0,`16*0`($out)		# write output
	xorps	`16*3`(%rsp),$inout3
	movups	$inout1,`16*1`($out)
	xorps	`16*4`(%rsp),$inout4
	movups	$inout2,`16*2`($out)
	xorps	`16*5`(%rsp),$inout5
	movups	$inout3,`16*3`($out)
	mov	$rnds_,$rounds			# restore $rounds
	movups	$inout4,`16*4`($out)
	movups	$inout5,`16*5`($out)
	lea	`16*6`($out),$out
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop

	lea	3($rounds,$rounds),$rounds	# restore original value
	mov	$key_,$key			# restore $key
	mov	$rounds,$rnds_			# backup $rounds

.Lxts_dec_short:
	add	\$16*6,$len
	jz	.Lxts_dec_done

	cmp	\$0x20,$len
	jb	.Lxts_dec_one
	je	.Lxts_dec_two

	cmp	\$0x40,$len
	jb	.Lxts_dec_three
	je	.Lxts_dec_four

	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	 movdqu	($inp),$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 movdqu	16*1($inp),$inout1
	pxor	$twres,@tweak[5]

	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	 pxor		$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	 pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out
	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	pand	$twmask,@tweak[1]		# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp
	xorps	@tweak[0],$inout0
___
	# Single-block decryption of $inout0; the helper is defined elsewhere
	# in this file.
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)
	movups	$inout1,16*1($out)
	lea	16*2($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	pshufd	\$0x13,$twtmp,$twres
	movdqa	@tweak[5],@tweak[4]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	 movups	($inp),$inout0
	pand	$twmask,$twres			# isolate carry and residue
	 movups	16*1($inp),$inout1
	pxor	$twres,@tweak[5]

	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	xorps	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)
	xorps	@tweak[3],$inout3
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	movups	$inout3,16*3($out)
	lea	16*4($out),$out
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	mov	$len_,$len
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	# Ciphertext stealing, step 1: the last whole input block is decrypted
	# with @tweak[1] (the later of the two remaining tweaks) and written
	# to ($out).
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

.Lxts_dec_steal:
	movzb	16($inp),%eax			# borrow $rounds ...
	movzb	($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	# Ciphertext stealing, step 2: the steal loop above swapped the
	# trailing input bytes into the block at ($out); that block is now
	# decrypted in place with @tweak[0].
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
___
# Win64 only: reload %xmm6-%xmm15 from the slots filled in the prologue.
$code.=<<___ if ($win64);
	movaps	0x60(%rsp),%xmm6
	movaps	0x70(%rsp),%xmm7
	movaps	0x80(%rsp),%xmm8
	movaps	0x90(%rsp),%xmm9
	movaps	0xa0(%rsp),%xmm10
	movaps	0xb0(%rsp),%xmm11
	movaps	0xc0(%rsp),%xmm12
	movaps	0xd0(%rsp),%xmm13
	movaps	0xe0(%rsp),%xmm14
	movaps	0xf0(%rsp),%xmm15
___
$code.=<<___;
	lea	(%rbp),%rsp
	pop	%rbp
.Lxts_dec_epilogue:
	ret
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
} }}

########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#			    size_t length, const AES_KEY *key,
#			    unsigned char *ivp,const int enc);
{
my $frame_size = 0x10 + ($win64?0x40:0);	# used in decrypt
$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
	_CET_ENDBR
	# Zero length returns at once; a zero 6th argument selects decryption.
	test	$len,$len		# check length
	jz	.Lcbc_ret

	mov	240($key),$rnds_	# key->rounds
	mov	$key,$key_		# backup $key
	test	%r9d,%r9d		# 6th argument
	jz	.Lcbc_decrypt
#--------------------------- CBC ENCRYPT ------------------------------#
	movups	($ivp),$inout0		# load iv as initial state
	mov	$rnds_,$rounds
	cmp	\$16,$len
	jb	.Lcbc_enc_tail
	sub	\$16,$len
	jmp	.Lcbc_enc_loop
.align	16
.Lcbc_enc_loop:
	movups	($inp),$inout1		# load input
	lea	16($inp),$inp
	#xorps	$inout1,$inout0
___
	# One block of CBC encryption.  The helper receives both the running
	# state ($inout0) and the freshly loaded input ($inout1); since the
	# explicit xorps above is commented out, the helper presumably folds
	# the input into the state itself -- confirm against aesni_generate1,
	# which is defined elsewhere in this file.
	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
$code.=<<___;
	mov	$rnds_,$rounds		# restore $rounds
	mov	$key_,$key		# restore $key
	movups	$inout0,0($out)		# store output
	lea	16($out),$out
	sub	\$16,$len
	jnc	.Lcbc_enc_loop
	add	\$16,$len
	jnz	.Lcbc_enc_tail
	movups	$inout0,($ivp)
	jmp	.Lcbc_ret

.Lcbc_enc_tail:
	# Partial final block: copy the leftover bytes to the output buffer,
	# zero-fill up to 16 bytes, then run the loop once more in place.
	mov	$len,%rcx	# zaps $key
	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
	.long	0x9066A4F3	# rep movsb
	mov	\$16,%ecx	# zero tail
	sub	$len,%rcx
	xor	%eax,%eax
	.long	0x9066AAF3	# rep stosb
	lea	-16(%rdi),%rdi	# rewind $out by 1 block
	mov	$rnds_,$rounds	# restore $rounds
	mov	%rdi,%rsi	# $inp and $out are the same
	mov	$key_,$key	# restore $key
	xor	$len,$len	# len=16
	jmp	.Lcbc_enc_loop	# one more spin
#--------------------------- CBC DECRYPT ------------------------------#
.align	16
.Lcbc_decrypt:
	lea	(%rsp),%rax
	push	%rbp
	sub	\$$frame_size,%rsp
___
# Win64 only: %xmm6-%xmm9 are stored at 0x10..0x40(%rsp) and reloaded at
# .Lcbc_dec_ret; the first 16 bytes of the frame are scratch for the IV.
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
.Lcbc_decrypt_body:
___
# Decrypt proper.  More than 0x70 bytes: eight blocks per iteration through
# .Lcbc_dec_loop8, with the IV for the iteration kept at (%rsp).  Otherwise
# (and for whatever is left over) .Lcbc_dec_tail dispatches on one to seven
# blocks.
$code.=<<___;
	lea	-8(%rax),%rbp
	movups	($ivp),$iv
	mov	$rnds_,$rounds
	cmp	\$0x70,$len
	jbe	.Lcbc_dec_tail
	shr	\$1,$rnds_
	sub	\$0x70,$len
	mov	$rnds_,$rounds
	movaps	$iv,(%rsp)
	jmp	.Lcbc_dec_loop8_enter
.align	16
.Lcbc_dec_loop8:
	movaps	$rndkey0,(%rsp)		# save IV
	movups	$inout7,($out)
	lea	0x10($out),$out
.Lcbc_dec_loop8_enter:
	$movkey		($key),$rndkey0
	movups	($inp),$inout0			# load input
	movups	0x10($inp),$inout1
	$movkey		16($key),$rndkey1

	lea		32($key),$key
	movdqu	0x20($inp),$inout2
	xorps		$rndkey0,$inout0
	movdqu	0x30($inp),$inout3
	xorps		$rndkey0,$inout1
	movdqu	0x40($inp),$inout4
	aesdec		$rndkey1,$inout0
	pxor		$rndkey0,$inout2
	movdqu	0x50($inp),$inout5
	aesdec		$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	movdqu	0x60($inp),$inout6
	aesdec		$rndkey1,$inout2
	pxor		$rndkey0,$inout4
	movdqu	0x70($inp),$inout7
	aesdec		$rndkey1,$inout3
	pxor		$rndkey0,$inout5
	dec		$rounds
	aesdec		$rndkey1,$inout4
	pxor		$rndkey0,$inout6
	aesdec		$rndkey1,$inout5
	pxor		$rndkey0,$inout7
	$movkey		($key),$rndkey0
	aesdec		$rndkey1,$inout6
	aesdec		$rndkey1,$inout7
	$movkey		16($key),$rndkey1

	call		.Ldec_loop8_enter

	movups	($inp),$rndkey1		# re-load input
	movups	0x10($inp),$rndkey0
	xorps	(%rsp),$inout0		# ^= IV
	xorps	$rndkey1,$inout1
	movups	0x20($inp),$rndkey1
	xorps	$rndkey0,$inout2
	movups	0x30($inp),$rndkey0
	xorps	$rndkey1,$inout3
	movups	0x40($inp),$rndkey1
	xorps	$rndkey0,$inout4
	movups	0x50($inp),$rndkey0
	xorps	$rndkey1,$inout5
	movups	0x60($inp),$rndkey1
	xorps	$rndkey0,$inout6
	movups	0x70($inp),$rndkey0	# IV
	xorps	$rndkey1,$inout7
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout4,0x40($out)
	mov	$key_,$key		# restore $key
	movups	$inout5,0x50($out)
	lea	0x80($inp),$inp
	movups	$inout6,0x60($out)
	lea	0x70($out),$out
	sub	\$0x80,$len
	ja	.Lcbc_dec_loop8

	movaps	$inout7,$inout0
	movaps	$rndkey0,$iv
	add	\$0x70,$len
	jle	.Lcbc_dec_tail_collected
	movups	$inout0,($out)
	lea	1($rnds_,$rnds_),$rounds
	lea	0x10($out),$out
.Lcbc_dec_tail:
	movups	($inp),$inout0
	movaps	$inout0,$in0
	cmp	\$0x10,$len
	jbe	.Lcbc_dec_one

	movups	0x10($inp),$inout1
	movaps	$inout1,$in1
	cmp	\$0x20,$len
	jbe	.Lcbc_dec_two

	movups	0x20($inp),$inout2
	movaps	$inout2,$in2
	cmp	\$0x30,$len
	jbe	.Lcbc_dec_three

	movups	0x30($inp),$inout3
	cmp	\$0x40,$len
	jbe	.Lcbc_dec_four

	movups	0x40($inp),$inout4
	cmp	\$0x50,$len
	jbe	.Lcbc_dec_five

	movups	0x50($inp),$inout5
	cmp	\$0x60,$len
	jbe	.Lcbc_dec_six

	movups	0x60($inp),$inout6
	movaps	$iv,(%rsp)		# save IV
	call	_aesni_decrypt8
	movups	($inp),$rndkey1
	movups	0x10($inp),$rndkey0
	xorps	(%rsp),$inout0		# ^= IV
	xorps	$rndkey1,$inout1
	movups	0x20($inp),$rndkey1
	xorps	$rndkey0,$inout2
	movups	0x30($inp),$rndkey0
	xorps	$rndkey1,$inout3
	movups	0x40($inp),$rndkey1
	xorps	$rndkey0,$inout4
	movups	0x50($inp),$rndkey0
	xorps	$rndkey1,$inout5
	movups	0x60($inp),$iv		# IV
	xorps	$rndkey0,$inout6
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	lea	0x60($out),$out
	movaps	$inout6,$inout0
	sub	\$0x70,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_one:
___
	# Single-block decryption of $inout0; the helper is defined elsewhere
	# in this file.
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	$iv,$inout0
	movaps	$in0,$iv
	sub	\$0x10,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_two:
	xorps	$inout2,$inout2
	call	_aesni_decrypt3
	xorps	$iv,$inout0
	xorps	$in0,$inout1
	movups	$inout0,($out)
	movaps	$in1,$iv
	movaps	$inout1,$inout0
	lea	0x10($out),$out
	sub	\$0x20,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_three:
	call	_aesni_decrypt3
	xorps	$iv,$inout0
	xorps	$in0,$inout1
	movups	$inout0,($out)
	xorps	$in1,$inout2
	movups	$inout1,0x10($out)
	movaps	$in2,$iv
	movaps	$inout2,$inout0
	lea	0x20($out),$out
	sub	\$0x30,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_four:
	call	_aesni_decrypt4
	xorps	$iv,$inout0
	movups	0x30($inp),$iv
	xorps	$in0,$inout1
	movups	$inout0,($out)
	xorps	$in1,$inout2
	movups	$inout1,0x10($out)
	xorps	$in2,$inout3
	movups	$inout2,0x20($out)
	movaps	$inout3,$inout0
	lea	0x30($out),$out
	sub	\$0x40,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_five:
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	movups	0x10($inp),$rndkey1
	movups	0x20($inp),$rndkey0
	xorps	$iv,$inout0
	xorps	$in0,$inout1
	xorps	$rndkey1,$inout2
	movups	0x30($inp),$rndkey1
	xorps	$rndkey0,$inout3
	movups	0x40($inp),$iv
	xorps	$rndkey1,$inout4
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	lea	0x40($out),$out
	movaps	$inout4,$inout0
	sub	\$0x50,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_six:
	call	_aesni_decrypt6
	movups	0x10($inp),$rndkey1
	movups	0x20($inp),$rndkey0
	xorps	$iv,$inout0
	xorps	$in0,$inout1
	xorps	$rndkey1,$inout2
	movups	0x30($inp),$rndkey1
	xorps	$rndkey0,$inout3
	movups	0x40($inp),$rndkey0
	xorps	$rndkey1,$inout4
	movups	0x50($inp),$iv
	xorps	$rndkey0,$inout5
	movups	$inout0,($out)
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	lea	0x50($out),$out
	movaps	$inout5,$inout0
	sub	\$0x60,$len
	jmp	.Lcbc_dec_tail_collected
.align	16
.Lcbc_dec_tail_collected:
	# The last decrypted block is still in a register here; the new IV
	# is written back before that block is stored whole or in part.
	and	\$15,$len
	movups	$iv,($ivp)
	jnz	.Lcbc_dec_tail_partial
	movups	$inout0,($out)
	jmp	.Lcbc_dec_ret
.align	16
.Lcbc_dec_tail_partial:
	# Spill the block to the frame and byte-copy 16 minus the masked
	# length from there to the output.
	movaps	$inout0,(%rsp)
	mov	\$16,%rcx
	mov	$out,%rdi
	sub	$len,%rcx
	lea	(%rsp),%rsi
	.long	0x9066A4F3	# rep movsb

.Lcbc_dec_ret:
___
# Win64 only: reload %xmm6-%xmm9 from the slots filled at .Lcbc_decrypt.
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
___
$code.=<<___;
	lea	(%rbp),%rsp
	pop	%rbp
.Lcbc_ret:
	ret
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
#				int bits, AES_KEY *key)
#
# set_decrypt_key first calls __aesni_set_encrypt_key and returns early if
# that leaves a non-zero value in %eax.  Otherwise it reverses the order of
# the round keys in place: the first and last keys are simply swapped, every
# other pair is swapped with aesimc applied to both, and the middle key gets
# aesimc in place.
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;

$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
	_CET_ENDBR
	sub	\$8,%rsp
	call	__aesni_set_encrypt_key
	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
	test	%eax,%eax
	jnz	.Ldec_key_ret
	lea	16($key,$bits),$inp	# points at the end of key schedule

	$movkey	($key),%xmm0		# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

.Ldec_key_inverse:
	$movkey	($key),%xmm0		# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	cmp	$key,$inp
	ja	.Ldec_key_inverse

	$movkey	($key),%xmm0		# inverse middle
	aesimc	%xmm0,%xmm0
	$movkey	%xmm0,($inp)
.Ldec_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___

# This is based on submission by
#
#	Huang Ying <ying.huang@intel.com>
#	Vinodh Gopal <vinodh.gopal@intel.com>
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# ${PREFIX}_set_encrypt_key leaves its status in %rax/%eax:
#	 0	success
#	-1	userKey or key is NULL
#	-2	bits is not 128, 192 or 256
# On success the value loaded into $bits (9, 11 or 13) is stored at the
# location the inline comments identify as offset 240 from the key pointer.
# __aesni_set_encrypt_key is a second entry label placed after _CET_ENDBR;
# ${PREFIX}_set_decrypt_key calls it.  The .Lkey_expansion_* helpers follow
# the function's ret and are reached with call from the round sequences.
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
	_CET_ENDBR
__aesni_set_encrypt_key:
	sub	\$8,%rsp
	mov	\$-1,%rax
	test	$inp,$inp
	jz	.Lenc_key_ret
	test	$key,$key
	jz	.Lenc_key_ret

	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	lea	16($key),%rax
	cmp	\$256,$bits
	je	.L14rounds
	cmp	\$192,$bits
	je	.L12rounds
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits			# 10 rounds for 128-bit key
	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call	.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call	.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call	.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240(%rdx)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L12rounds:
	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
	mov	\$11,$bits			# 12 rounds for 192
	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
	call	.Lkey_expansion_192a_cold
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
	call	.Lkey_expansion_192b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
	call	.Lkey_expansion_192a
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
	call	.Lkey_expansion_192b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
	call	.Lkey_expansion_192a
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
	call	.Lkey_expansion_192b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
	call	.Lkey_expansion_192a
	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
	call	.Lkey_expansion_192b
	$movkey	%xmm0,(%rax)
	mov	$bits,48(%rax)	# 240(%rdx)
	xor	%rax, %rax
	jmp	.Lenc_key_ret

.align	16
.L14rounds:
	movups	16($inp),%xmm2			# remaining half of *userKey
	mov	\$13,$bits			# 14 rounds for 256
	lea	16(%rax),%rax
	$movkey	%xmm0,($key)			# round 0
	$movkey	%xmm2,16($key)			# round 1
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call	.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call	.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call	.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call	.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240(%rdx)
	xor	%rax,%rax
	jmp	.Lenc_key_ret

.align	16
.Lbad_keybits:
	mov	\$-2,%rax
.Lenc_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_encrypt_key:

.align	16
.Lkey_expansion_128:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align	16
.Lkey_expansion_192a:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps	\$0b00010000,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pslldq	\$4,%xmm3
	xorps	%xmm4,%xmm0
	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	\$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	ret

.align	16
.Lkey_expansion_192b:
	movaps	%xmm0,%xmm3
	shufps	\$0b01000100,%xmm0,%xmm5
	$movkey	%xmm5,(%rax)
	shufps	\$0b01001110,%xmm2,%xmm3
	$movkey	%xmm3,16(%rax)
	lea	32(%rax),%rax
	jmp	.Lkey_expansion_192b_warm

.align	16
.Lkey_expansion_256a:
	$movkey	%xmm2,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align	16
.Lkey_expansion_256b:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax

	shufps	\$0b00010000,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10001100,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm2
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}

# Read-only constants, emitted into .rodata; the assembler is switched back
# to .text afterwards.  The labels are referenced from code outside this
# part of the file.
$code.=<<___;
.section .rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
.Lxts_magic:
	.long	0x87,0,1,0
.align	64
.text
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Everything in this branch is emitted only when $win64 is set.  The ecb,
# ccm64, ctr32 and xts handlers (and their .pdata/.xdata records) are
# emitted only when $PREFIX is "aesni"; the cbc handler and the key-setup
# records are emitted unconditionally within the branch.  Every handler
# ends by jumping to .Lcommon_seh_tail or .Lcommon_rbp_tail, both of which
# live inside cbc_se_handler.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
$code.=<<___ if ($PREFIX eq "aesni");
.type	ecb_se_handler,\@abi-omnipotent
.align	16
ecb_se_handler:
	_CET_ENDBR
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp

	jmp	.Lcommon_seh_tail
.size	ecb_se_handler,.-ecb_se_handler

.type	ccm64_se_handler,\@abi-omnipotent
.align	16
ccm64_se_handler:
	_CET_ENDBR
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	0(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer

	jmp	.Lcommon_seh_tail
.size	ccm64_se_handler,.-ccm64_se_handler

.type	ctr32_se_handler,\@abi-omnipotent
.align	16
ctr32_se_handler:
	_CET_ENDBR
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lctr32_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lctr32_ret(%rip),%r10
	cmp	%r10,%rbx
	jae	.Lcommon_seh_tail

	lea	0x20(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_rbp_tail
.size	ctr32_se_handler,.-ctr32_se_handler

.type	xts_se_handler,\@abi-omnipotent
.align	16
xts_se_handler:
	_CET_ENDBR
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	0x60(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_rbp_tail
.size	xts_se_handler,.-xts_se_handler
___
$code.=<<___;
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	_CET_ENDBR
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lcommon_seh_tail

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lrestore_cbc_rax

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lcommon_seh_tail

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_rbp_tail:
	mov	160($context),%rax	# pull context->Rbp
	mov	(%rax),%rbp		# restore saved %rbp
	lea	8(%rax),%rax		# adjust stack pointer
	mov	%rbp,160($context)	# restore context->Rbp
	jmp	.Lcommon_seh_tail

.Lrestore_cbc_rax:
	mov	120($context),%rax

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($PREFIX eq "aesni");
	.rva	.LSEH_begin_aesni_ecb_encrypt
	.rva	.LSEH_end_aesni_ecb_encrypt
	.rva	.LSEH_info_ecb

	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
	.rva	.LSEH_info_ccm64_enc

	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
	.rva	.LSEH_info_ccm64_dec

	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32

	.rva	.LSEH_begin_aesni_xts_encrypt
	.rva	.LSEH_end_aesni_xts_encrypt
	.rva	.LSEH_info_xts_enc

	.rva	.LSEH_begin_aesni_xts_decrypt
	.rva	.LSEH_end_aesni_xts_decrypt
	.rva	.LSEH_info_xts_dec
___
$code.=<<___;
	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc

	.rva	${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_set_decrypt_key
	.rva	.LSEH_info_key

	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
___
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_se_handler
.LSEH_info_ccm64_enc:
	.byte	9,0,0,0
	.rva	ccm64_se_handler
	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
.LSEH_info_ccm64_dec:
	.byte	9,0,0,0
	.rva	ccm64_se_handler
	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
.LSEH_info_ctr32:
	.byte	9,0,0,0
	.rva	ctr32_se_handler
.LSEH_info_xts_enc:
	.byte	9,0,0,0
	.rva	xts_se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.LSEH_info_xts_dec:
	.byte	9,0,0,0
	.rva	xts_se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# rex(OPCODE, DST, SRC)
#
# Appends one byte to the array passed as the first argument when either
# register number is 8 or higher: 0x40, with 0x04 OR-ed in if DST >= 8 and
# 0x01 OR-ed in if SRC >= 8.  Appends nothing when both are below 8.
# The array is modified through the caller's handle rather than by
# aliasing a package glob with local().
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04 if($dst>=8);
    $rex|=0x01 if($src>=8);
    push @{$opcode},$rex|0x40 if($rex);
}

# Replace every `...` span in the accumulated assembly with the result of
# evaluating its contents as Perl.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Buffered write errors only surface when the handle is closed, so a
# failure here must abort the build instead of leaving truncated output.
close STDOUT or die "error closing STDOUT: $!";
30375cdd308eSdjm .byte 9,0,0,0 30385cdd308eSdjm .rva ccm64_se_handler 30395cdd308eSdjm .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 30405cdd308eSdjm.LSEH_info_ccm64_dec: 30415cdd308eSdjm .byte 9,0,0,0 30425cdd308eSdjm .rva ccm64_se_handler 30435cdd308eSdjm .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 30445cdd308eSdjm.LSEH_info_ctr32: 30455cdd308eSdjm .byte 9,0,0,0 30465cdd308eSdjm .rva ctr32_se_handler 30475cdd308eSdjm.LSEH_info_xts_enc: 30485cdd308eSdjm .byte 9,0,0,0 30495cdd308eSdjm .rva xts_se_handler 30505cdd308eSdjm .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 30515cdd308eSdjm.LSEH_info_xts_dec: 30525cdd308eSdjm .byte 9,0,0,0 30535cdd308eSdjm .rva xts_se_handler 30545cdd308eSdjm .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 30555cdd308eSdjm___ 30565cdd308eSdjm$code.=<<___; 30576249468aSthib.LSEH_info_cbc: 30586249468aSthib .byte 9,0,0,0 30596249468aSthib .rva cbc_se_handler 30606249468aSthib.LSEH_info_key: 30616249468aSthib .byte 0x01,0x04,0x01,0x00 30625cdd308eSdjm .byte 0x04,0x02,0x00,0x00 # sub rsp,8 30636249468aSthib___ 30646249468aSthib} 30656249468aSthib 30666249468aSthibsub rex { 30676249468aSthib local *opcode=shift; 30686249468aSthib my ($dst,$src)=@_; 30695cdd308eSdjm my $rex=0; 30706249468aSthib 30716249468aSthib $rex|=0x04 if($dst>=8); 30726249468aSthib $rex|=0x01 if($src>=8); 30735cdd308eSdjm push @opcode,$rex|0x40 if($rex); 30746249468aSthib} 30756249468aSthib 30766249468aSthib$code =~ s/\`([^\`]*)\`/eval($1)/gem; 30776249468aSthib 30786249468aSthibprint $code; 30796249468aSthib 30806249468aSthibclose STDOUT; 3081