xref: /openbsd/lib/libcrypto/aes/asm/aesni-x86_64.pl (revision 5caf18b2)
16249468aSthib#!/usr/bin/env perl
26249468aSthib#
36249468aSthib# ====================================================================
46249468aSthib# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
56249468aSthib# project. The module is, however, dual licensed under OpenSSL and
66249468aSthib# CRYPTOGAMS licenses depending on where you obtain it. For further
76249468aSthib# details see http://www.openssl.org/~appro/cryptogams/.
86249468aSthib# ====================================================================
96249468aSthib#
106249468aSthib# This module implements support for Intel AES-NI extension. In
116249468aSthib# OpenSSL context it's used with Intel engine, but can also be used as
126249468aSthib# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
136249468aSthib# details].
145cdd308eSdjm#
155cdd308eSdjm# Performance.
165cdd308eSdjm#
175cdd308eSdjm# Given aes(enc|dec) instructions' latency asymptotic performance for
185cdd308eSdjm# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
195cdd308eSdjm# processed with 128-bit key. And given their throughput asymptotic
205cdd308eSdjm# performance for parallelizable modes is 1.25 cycles per byte. Being
215cdd308eSdjm# asymptotic limit it's not something you commonly achieve in reality,
225cdd308eSdjm# but how close does one get? Below are results collected for
235cdd308eSdjm# different modes and block sizes. Pairs of numbers are for en-/
245cdd308eSdjm# decryption.
255cdd308eSdjm#
265cdd308eSdjm#	16-byte     64-byte     256-byte    1-KB        8-KB
275cdd308eSdjm# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
285cdd308eSdjm# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
295cdd308eSdjm# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
305cdd308eSdjm# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
315cdd308eSdjm# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
325cdd308eSdjm# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
335cdd308eSdjm#
345cdd308eSdjm# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
355cdd308eSdjm# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
365cdd308eSdjm# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
375cdd308eSdjm# The results were collected with specially crafted speed.c benchmark
385cdd308eSdjm# in order to compare them with results reported in "Intel Advanced
395cdd308eSdjm# Encryption Standard (AES) New Instruction Set" White Paper Revision
405cdd308eSdjm# 3.0 dated May 2010. All above results are consistently better. This
415cdd308eSdjm# module also provides better performance for block sizes smaller than
425cdd308eSdjm# 128 bytes in points *not* represented in the above table.
435cdd308eSdjm#
445cdd308eSdjm# Looking at the results for 8-KB buffer.
455cdd308eSdjm#
465cdd308eSdjm# CFB and OFB results are far from the limit, because implementation
475cdd308eSdjm# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
485cdd308eSdjm# single-block aesni_encrypt, which is not the most optimal way to go.
495cdd308eSdjm# CBC encrypt result is unexpectedly high and there is no documented
505cdd308eSdjm# explanation for it. Seemingly there is a small penalty for feeding
515cdd308eSdjm# the result back to AES unit the way it's done in CBC mode. There is
525cdd308eSdjm# nothing one can do and the result appears optimal. CCM result is
535cdd308eSdjm# identical to CBC, because CBC-MAC is essentially CBC encrypt without
545cdd308eSdjm# saving output. CCM CTR "stays invisible," because it's neatly
5571743258Sjmc# interleaved with CBC-MAC. This provides ~30% improvement over
565cdd308eSdjm# "straightforward" CCM implementation with CTR and CBC-MAC performed
575cdd308eSdjm# disjointly. Parallelizable modes practically achieve the theoretical
585cdd308eSdjm# limit.
595cdd308eSdjm#
605cdd308eSdjm# Looking at how results vary with buffer size.
615cdd308eSdjm#
625cdd308eSdjm# Curves are practically saturated at 1-KB buffer size. In most cases
635cdd308eSdjm# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
645cdd308eSdjm# CTR curve doesn't follow this pattern and is "slowest" changing one
655cdd308eSdjm# with "256-byte" result being 87% of "8-KB." This is because overhead
665cdd308eSdjm# in CTR mode is most computationally intensive. Small-block CCM
675cdd308eSdjm# decrypt is slower than encrypt, because first CTR and last CBC-MAC
685cdd308eSdjm# iterations can't be interleaved.
695cdd308eSdjm#
705cdd308eSdjm# Results for 192- and 256-bit keys.
715cdd308eSdjm#
725cdd308eSdjm# EVP-free results were observed to scale perfectly with number of
735cdd308eSdjm# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
745cdd308eSdjm# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
755cdd308eSdjm# are a tad smaller, because the above mentioned penalty biases all
765cdd308eSdjm# results by same constant value. In similar way function call
775cdd308eSdjm# overhead affects small-block performance, as well as OFB and CFB
785cdd308eSdjm# results. Differences are not large, most common coefficients are
795cdd308eSdjm# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one can
805cdd308eSdjm# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
815cdd308eSdjm
825cdd308eSdjm# January 2011
835cdd308eSdjm#
845cdd308eSdjm# While Westmere processor features 6 cycles latency for aes[enc|dec]
855cdd308eSdjm# instructions, which can be scheduled every second cycle, Sandy
865cdd308eSdjm# Bridge spends 8 cycles per instruction, but it can schedule them
875cdd308eSdjm# every cycle. This means that code targeting Westmere would perform
885cdd308eSdjm# suboptimally on Sandy Bridge. Therefore this update.
895cdd308eSdjm#
905cdd308eSdjm# In addition, non-parallelizable CBC encrypt (as well as CCM) is
915cdd308eSdjm# optimized. Relative improvement might appear modest, 8% on Westmere,
925cdd308eSdjm# but in absolute terms it's 3.77 cycles per byte encrypted with
935cdd308eSdjm# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
945cdd308eSdjm# should be compared to asymptotic limits of 3.75 for Westmere and
955cdd308eSdjm# 5.00 for Sandy Bridge. Actually, the fact that they get this close
965cdd308eSdjm# to asymptotic limits is quite amazing. Indeed, the limit is
975cdd308eSdjm# calculated as latency times number of rounds, 10 for 128-bit key,
985cdd308eSdjm# and divided by 16, the number of bytes in block, or in other words
995cdd308eSdjm# it accounts *solely* for aesenc instructions. But there are extra
1005cdd308eSdjm# instructions, and numbers so close to the asymptotic limits mean
1015cdd308eSdjm# that it's as if it takes as little as *one* additional cycle to
1025cdd308eSdjm# execute all of them. How is it possible? It is possible thanks to
1035cdd308eSdjm# out-of-order execution logic, which manages to overlap post-
1045cdd308eSdjm# processing of previous block, things like saving the output, with
1055cdd308eSdjm# actual encryption of current block, as well as pre-processing of
1065cdd308eSdjm# current block, things like fetching input and xor-ing it with
1075cdd308eSdjm# 0-round element of the key schedule, with actual encryption of
1085cdd308eSdjm# previous block. Keep this in mind...
1095cdd308eSdjm#
1105cdd308eSdjm# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
1115cdd308eSdjm# performance is achieved by interleaving instructions working on
1125cdd308eSdjm# independent blocks. In which case asymptotic limit for such modes
1135cdd308eSdjm# can be obtained by dividing above mentioned numbers by AES
1145cdd308eSdjm# instructions' interleave factor. Westmere can execute at most 3
1155cdd308eSdjm# instructions at a time, meaning that optimal interleave factor is 3,
1165cdd308eSdjm# and that's where the "magic" number of 1.25 comes from. "Optimal
1175cdd308eSdjm# interleave factor" means that increase of interleave factor does
1185cdd308eSdjm# not improve performance. The formula has proven to reflect reality
1195cdd308eSdjm# pretty well on Westmere... Sandy Bridge on the other hand can
1205cdd308eSdjm# execute up to 8 AES instructions at a time, so how does varying
1215cdd308eSdjm# interleave factor affect the performance? Here is table for ECB
1225cdd308eSdjm# (numbers are cycles per byte processed with 128-bit key):
1235cdd308eSdjm#
1245cdd308eSdjm# instruction interleave factor		3x	6x	8x
1255cdd308eSdjm# theoretical asymptotic limit		1.67	0.83	0.625
1265cdd308eSdjm# measured performance for 8KB block	1.05	0.86	0.84
1275cdd308eSdjm#
1285cdd308eSdjm# "as if" interleave factor		4.7x	5.8x	6.0x
1295cdd308eSdjm#
1305cdd308eSdjm# Further data for other parallelizable modes:
1315cdd308eSdjm#
1325cdd308eSdjm# CBC decrypt				1.16	0.93	0.93
1335cdd308eSdjm# CTR					1.14	0.91	n/a
1345cdd308eSdjm#
1355cdd308eSdjm# Well, given 3x column it's probably inappropriate to call the limit
1365cdd308eSdjm# asymptotic, if it can be surpassed, isn't it? What happens there?
1375cdd308eSdjm# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
1385cdd308eSdjm# magic is responsible for this. Processor overlaps not only the
13971743258Sjmc# additional instructions with AES ones, but even AES instructions
1405cdd308eSdjm# processing adjacent triplets of independent blocks. In the 6x case
1415cdd308eSdjm# additional instructions still claim a disproportionately small amount
1425cdd308eSdjm# of additional cycles, but in 8x case number of instructions must be
1435cdd308eSdjm# a tad too high for out-of-order logic to cope with, and AES unit
1445cdd308eSdjm# remains underutilized... As you can see 8x interleave is hardly
1455cdd308eSdjm# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
1465cdd308eSdjm# utilizes 6x interleave because of limited register bank capacity.
1475cdd308eSdjm#
1485cdd308eSdjm# Higher interleave factors do have negative impact on Westmere
1495cdd308eSdjm# performance. While for ECB mode it's negligible ~1.5%, other
1505cdd308eSdjm# parallelizables perform ~5% worse, which is outweighed by ~25%
1515cdd308eSdjm# improvement on Sandy Bridge. To balance regression on Westmere
1525cdd308eSdjm# CTR mode was implemented with 6x aesenc interleave factor.
1535cdd308eSdjm
1545cdd308eSdjm# April 2011
1555cdd308eSdjm#
1565cdd308eSdjm# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
1575cdd308eSdjm# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
1585cdd308eSdjm# in CTR mode AES instruction interleave factor was chosen to be 6x.
1596249468aSthib
1606249468aSthib$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
1616249468aSthib			# generates drop-in replacement for
1626249468aSthib			# crypto/aes/asm/aes-x86_64.pl:-)
1636249468aSthib
# Command line: [flavour] output. A first argument containing a dot is
# taken to be the output file name, in which case no flavour was given.
1646249468aSthib$flavour = shift;
1656249468aSthib$output  = shift;
1666249468aSthibif ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
1676249468aSthib
# Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
# output file name ending in ".asm".
1686249468aSthib$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
1696249468aSthib
# Look for the x86_64-xlate.pl translator in this script's own directory
# first, then under ../../perlasm/ relative to it; give up otherwise.
1706249468aSthib$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
1716249468aSthib( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
1726249468aSthib( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
1736249468aSthibdie "can't locate x86_64-xlate.pl";
1746249468aSthib
# Pipe everything subsequently printed to STDOUT through the translator,
# run with the same perl binary ($^X). The open is checked so that a
# failure to start the pipe aborts the run instead of going unnoticed.
17597222eddSmiodopen OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
17697222eddSmiod*STDOUT=*OUT;
1776249468aSthib
# Symbolic names for the instruction mnemonic and registers interpolated
# into the assembly templates below.
#
# $movkey is the instruction used to load round keys. Both arms of the
# conditional currently yield "movups", so the choice does not depend on
# $PREFIX.
1785cdd308eSdjm$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
# First four argument registers, in the order selected by $win64.
1796249468aSthib@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
1806249468aSthib		("%rdi","%rsi","%rdx","%rcx");	# Unix order
1816249468aSthib
# $code accumulates the generated assembly text; it starts in .text.
1826249468aSthib$code=".text\n";
1836249468aSthib
1846249468aSthib$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
1856249468aSthib# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
1866249468aSthib$inp="%rdi";
1876249468aSthib$out="%rsi";
1886249468aSthib$len="%rdx";
1896249468aSthib$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
1905cdd308eSdjm$ivp="%r8";	# cbc, ctr, ...
1916249468aSthib
1926249468aSthib$rnds_="%r10d";	# backup copy for $rounds
1936249468aSthib$key_="%r11";	# backup copy for $key
1946249468aSthib
1956249468aSthib# %xmm register layout
# $rndkey0/$rndkey1 hold round keys; $inout0..$inout7 hold up to eight
# data blocks being processed.
1965cdd308eSdjm$rndkey0="%xmm0";	$rndkey1="%xmm1";
1975cdd308eSdjm$inout0="%xmm2";	$inout1="%xmm3";
1985cdd308eSdjm$inout2="%xmm4";	$inout3="%xmm5";
1995cdd308eSdjm$inout4="%xmm6";	$inout5="%xmm7";
2005cdd308eSdjm$inout6="%xmm8";	$inout7="%xmm9";
2016249468aSthib
# Note that $in2, $in1, $in0 and $iv name the same registers as $inout4,
# $inout5, $inout6 and $inout7 respectively (%xmm6..%xmm9), so the two
# sets cannot be live at the same time.
2025cdd308eSdjm$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
2035cdd308eSdjm$in0="%xmm8";		$iv="%xmm9";
2046249468aSthib
2056249468aSthib# Inline version of internal aesni_[en|de]crypt1.
2066249468aSthib#
2076249468aSthib# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
2086249468aSthib# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds,$inout,$ivec) appends a single-block
# en-/decryption sequence to $code:
#   $p      - "enc" or "dec"; used to form the aes${p} and aes${p}last
#             mnemonics and the loop label.
#   $key    - register pointing at the key schedule; the emitted code
#             advances it past every round key it loads.
#   $rounds - register holding the loop count; the emitted code
#             decrements it until it reaches zero.
#   $inout  - register holding the block, processed in place; defaults to
#             $inout0 when omitted.
#   $ivec   - optional register. When given, the emitted code first xors
#             round key 0 into $ivec and then xors $ivec into $inout
#             (so $ivec is modified); when omitted, round key 0 is xored
#             straight into $inout.
# $sn is a counter private to this sub; it is bumped on every call so that
# each inlined copy gets its own .Loop_${p}1_$sn label.
2096249468aSthib{ my $sn;
2106249468aSthibsub aesni_generate1 {
2115cdd308eSdjmmy ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
2126249468aSthib++$sn;
2136249468aSthib$code.=<<___;
2146249468aSthib	$movkey	($key),$rndkey0
2156249468aSthib	$movkey	16($key),$rndkey1
2165cdd308eSdjm___
2175cdd308eSdjm$code.=<<___ if (defined($ivec));
2185cdd308eSdjm	xorps	$rndkey0,$ivec
2196249468aSthib	lea	32($key),$key
2205cdd308eSdjm	xorps	$ivec,$inout
2215cdd308eSdjm___
2225cdd308eSdjm$code.=<<___ if (!defined($ivec));
2235cdd308eSdjm	lea	32($key),$key
2245cdd308eSdjm	xorps	$rndkey0,$inout
2255cdd308eSdjm___
2265cdd308eSdjm$code.=<<___;
2276249468aSthib.Loop_${p}1_$sn:
2285cdd308eSdjm	aes${p}	$rndkey1,$inout
2296249468aSthib	dec	$rounds
2306249468aSthib	$movkey	($key),$rndkey1
2316249468aSthib	lea	16($key),$key
2326249468aSthib	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
2335cdd308eSdjm	aes${p}last	$rndkey1,$inout
2346249468aSthib___
2356249468aSthib}}
2366249468aSthib# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
2376249468aSthib#
# Emits the two public single-block entry points ${PREFIX}_encrypt and
# ${PREFIX}_decrypt. Each loads one 16-byte block from inp, reads the
# round count from offset 240 of the key structure, runs the inlined
# one-block sequence produced by aesni_generate1 and stores the result
# to out.
#
# Inside this scope $inp, $out and $key are lexically rebound to the first
# three entries of @_4args, so the same template serves both the Win64
# and the Unix argument order; $rounds and $inout0 are the file-level
# register names.
# NOTE(review): "\@abi-omnipotent" is interpreted by x86_64-xlate.pl,
# which is not part of this file -- presumably it marks a routine that
# needs no ABI-specific prologue/epilogue; confirm there.
2386249468aSthib{ my ($inp,$out,$key) = @_4args;
2396249468aSthib
2406249468aSthib$code.=<<___;
2416249468aSthib.globl	${PREFIX}_encrypt
2426249468aSthib.type	${PREFIX}_encrypt,\@abi-omnipotent
2436249468aSthib.align	16
2446249468aSthib${PREFIX}_encrypt:
24522787c51Stb	_CET_ENDBR
2466249468aSthib	movups	($inp),$inout0		# load input
2475cdd308eSdjm	mov	240($key),$rounds	# key->rounds
2486249468aSthib___
2496249468aSthib	&aesni_generate1("enc",$key,$rounds);
2506249468aSthib$code.=<<___;
2516249468aSthib	movups	$inout0,($out)		# output
2526249468aSthib	ret
2536249468aSthib.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
2546249468aSthib
2556249468aSthib.globl	${PREFIX}_decrypt
2566249468aSthib.type	${PREFIX}_decrypt,\@abi-omnipotent
2576249468aSthib.align	16
2586249468aSthib${PREFIX}_decrypt:
25922787c51Stb	_CET_ENDBR
2606249468aSthib	movups	($inp),$inout0		# load input
2615cdd308eSdjm	mov	240($key),$rounds	# key->rounds
2626249468aSthib___
2636249468aSthib	&aesni_generate1("dec",$key,$rounds);
2646249468aSthib$code.=<<___;
2656249468aSthib	movups	$inout0,($out)		# output
2666249468aSthib	ret
2676249468aSthib.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
2686249468aSthib___
2696249468aSthib}
2706249468aSthib
2715cdd308eSdjm# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
2725cdd308eSdjm# factor. Why were 3x subroutines originally used in loops? Even though
2735cdd308eSdjm# aes[enc|dec] latency was originally 6, it could be scheduled only
2745cdd308eSdjm# every *2nd* cycle. Thus 3x interleave was the one providing optimal
2756249468aSthib# utilization, i.e. when subroutine's throughput is virtually same as
2766249468aSthib# of non-interleaved subroutine [for number of input blocks up to 3].
2775cdd308eSdjm# This is why it makes no sense to implement 2x subroutine.
2785cdd308eSdjm# aes[enc|dec] latency in next processor generation is 8, but the
2795cdd308eSdjm# instructions can be scheduled every cycle. Optimal interleave for
2805cdd308eSdjm# new processor is therefore 8x...
2816249468aSthibsub aesni_generate3 {
2826249468aSthibmy $dir=shift;
2836249468aSthib# As already mentioned it takes in $key and $rounds, which are *not*
2846249468aSthib# preserved. $inout[0-2] is cipher/clear text...
#
# Appends the private routine _aesni_${dir}rypt3 to $code. $dir is the
# mnemonic/label fragment ("enc" or "dec" at the call sites), giving
# _aesni_encrypt3 / _aesni_decrypt3 and aesenc / aesdec. This lexical
# $dir shadows the file-level $dir that holds the script directory.
#
# Shape of the emitted routine: the three blocks are xored with round
# key 0; the loop then applies two round keys per iteration ($rndkey1
# followed by $rndkey0), which is why $rounds is halved with shr on
# entry; after the loop one more aes${dir} with $rndkey1 and the final
# aes${dir}last with $rndkey0 complete the cipher. The jnz at the bottom
# of the loop tests the result of the dec issued earlier in the body.
2856249468aSthib$code.=<<___;
2866249468aSthib.type	_aesni_${dir}rypt3,\@abi-omnipotent
2876249468aSthib.align	16
2886249468aSthib_aesni_${dir}rypt3:
28922787c51Stb	_CET_ENDBR
2906249468aSthib	$movkey	($key),$rndkey0
2916249468aSthib	shr	\$1,$rounds
2926249468aSthib	$movkey	16($key),$rndkey1
2936249468aSthib	lea	32($key),$key
2945cdd308eSdjm	xorps	$rndkey0,$inout0
2955cdd308eSdjm	xorps	$rndkey0,$inout1
2965cdd308eSdjm	xorps	$rndkey0,$inout2
2975cdd308eSdjm	$movkey		($key),$rndkey0
2986249468aSthib
2996249468aSthib.L${dir}_loop3:
3006249468aSthib	aes${dir}	$rndkey1,$inout0
3016249468aSthib	aes${dir}	$rndkey1,$inout1
3026249468aSthib	dec		$rounds
3036249468aSthib	aes${dir}	$rndkey1,$inout2
3046249468aSthib	$movkey		16($key),$rndkey1
3055cdd308eSdjm	aes${dir}	$rndkey0,$inout0
3066249468aSthib	aes${dir}	$rndkey0,$inout1
3076249468aSthib	lea		32($key),$key
3086249468aSthib	aes${dir}	$rndkey0,$inout2
3095cdd308eSdjm	$movkey		($key),$rndkey0
3106249468aSthib	jnz		.L${dir}_loop3
3116249468aSthib
3126249468aSthib	aes${dir}	$rndkey1,$inout0
3136249468aSthib	aes${dir}	$rndkey1,$inout1
3146249468aSthib	aes${dir}	$rndkey1,$inout2
3156249468aSthib	aes${dir}last	$rndkey0,$inout0
3166249468aSthib	aes${dir}last	$rndkey0,$inout1
3176249468aSthib	aes${dir}last	$rndkey0,$inout2
3186249468aSthib	ret
3196249468aSthib.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
3206249468aSthib___
3216249468aSthib}
3226249468aSthib# 4x interleave is implemented to improve small block performance,
3236249468aSthib# most notably [and naturally] 4 block by ~30%. One can argue that one
3246249468aSthib# should have implemented 5x as well, but improvement would be <20%,
3256249468aSthib# so it's not worth it...
3266249468aSthibsub aesni_generate4 {
3276249468aSthibmy $dir=shift;
3286249468aSthib# As already mentioned it takes in $key and $rounds, which are *not*
3296249468aSthib# preserved. $inout[0-3] is cipher/clear text...
#
# Appends the private routine _aesni_${dir}rypt4 to $code; $dir is "enc"
# or "dec" at the call sites and shadows the file-level $dir (script
# directory) inside this sub.
#
# Same structure as the 3x routine with a fourth block added: xor of
# round key 0 into $inout0..$inout3, a loop applying two round keys per
# iteration ($rounds is halved on entry for that reason), then a last
# aes${dir} with $rndkey1 and aes${dir}last with $rndkey0.
3306249468aSthib$code.=<<___;
3316249468aSthib.type	_aesni_${dir}rypt4,\@abi-omnipotent
3326249468aSthib.align	16
3336249468aSthib_aesni_${dir}rypt4:
33422787c51Stb	_CET_ENDBR
3356249468aSthib	$movkey	($key),$rndkey0
3366249468aSthib	shr	\$1,$rounds
3376249468aSthib	$movkey	16($key),$rndkey1
3386249468aSthib	lea	32($key),$key
3395cdd308eSdjm	xorps	$rndkey0,$inout0
3405cdd308eSdjm	xorps	$rndkey0,$inout1
3415cdd308eSdjm	xorps	$rndkey0,$inout2
3425cdd308eSdjm	xorps	$rndkey0,$inout3
3435cdd308eSdjm	$movkey	($key),$rndkey0
3446249468aSthib
3456249468aSthib.L${dir}_loop4:
3466249468aSthib	aes${dir}	$rndkey1,$inout0
3476249468aSthib	aes${dir}	$rndkey1,$inout1
3486249468aSthib	dec		$rounds
3496249468aSthib	aes${dir}	$rndkey1,$inout2
3506249468aSthib	aes${dir}	$rndkey1,$inout3
3516249468aSthib	$movkey		16($key),$rndkey1
3525cdd308eSdjm	aes${dir}	$rndkey0,$inout0
3536249468aSthib	aes${dir}	$rndkey0,$inout1
3546249468aSthib	lea		32($key),$key
3556249468aSthib	aes${dir}	$rndkey0,$inout2
3566249468aSthib	aes${dir}	$rndkey0,$inout3
3575cdd308eSdjm	$movkey		($key),$rndkey0
3586249468aSthib	jnz		.L${dir}_loop4
3596249468aSthib
3606249468aSthib	aes${dir}	$rndkey1,$inout0
3616249468aSthib	aes${dir}	$rndkey1,$inout1
3626249468aSthib	aes${dir}	$rndkey1,$inout2
3636249468aSthib	aes${dir}	$rndkey1,$inout3
3646249468aSthib	aes${dir}last	$rndkey0,$inout0
3656249468aSthib	aes${dir}last	$rndkey0,$inout1
3666249468aSthib	aes${dir}last	$rndkey0,$inout2
3676249468aSthib	aes${dir}last	$rndkey0,$inout3
3686249468aSthib	ret
3696249468aSthib.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
3706249468aSthib___
3716249468aSthib}
3725cdd308eSdjmsub aesni_generate6 {
3735cdd308eSdjmmy $dir=shift;
3745cdd308eSdjm# As already mentioned it takes in $key and $rounds, which are *not*
3755cdd308eSdjm# preserved. $inout[0-5] is cipher/clear text...
#
# Appends the private routine _aesni_${dir}rypt6 to $code; $dir is "enc"
# or "dec" at the call sites and shadows the file-level $dir (script
# directory) inside this sub.
#
# Unlike the 3x/4x routines, the prologue here interleaves the xor of
# round key 0 into each block with the first aes${dir} round (using
# $rndkey1) on the blocks already xored, performs the first dec of
# $rounds itself, and then jumps into the middle of the loop at
# .L${dir}_loop6_enter, where the $rndkey0 half of an iteration starts.
# Each full loop iteration still applies two round keys ($rounds is
# halved on entry), and the tail applies one more aes${dir} with
# $rndkey1 followed by aes${dir}last with $rndkey0.
3765cdd308eSdjm$code.=<<___;
3775cdd308eSdjm.type	_aesni_${dir}rypt6,\@abi-omnipotent
3785cdd308eSdjm.align	16
3795cdd308eSdjm_aesni_${dir}rypt6:
38022787c51Stb	_CET_ENDBR
3815cdd308eSdjm	$movkey		($key),$rndkey0
3825cdd308eSdjm	shr		\$1,$rounds
3835cdd308eSdjm	$movkey		16($key),$rndkey1
3845cdd308eSdjm	lea		32($key),$key
3855cdd308eSdjm	xorps		$rndkey0,$inout0
3865cdd308eSdjm	pxor		$rndkey0,$inout1
3875cdd308eSdjm	aes${dir}	$rndkey1,$inout0
3885cdd308eSdjm	pxor		$rndkey0,$inout2
3895cdd308eSdjm	aes${dir}	$rndkey1,$inout1
3905cdd308eSdjm	pxor		$rndkey0,$inout3
3915cdd308eSdjm	aes${dir}	$rndkey1,$inout2
3925cdd308eSdjm	pxor		$rndkey0,$inout4
3935cdd308eSdjm	aes${dir}	$rndkey1,$inout3
3945cdd308eSdjm	pxor		$rndkey0,$inout5
3955cdd308eSdjm	dec		$rounds
3965cdd308eSdjm	aes${dir}	$rndkey1,$inout4
3975cdd308eSdjm	$movkey		($key),$rndkey0
3985cdd308eSdjm	aes${dir}	$rndkey1,$inout5
3995cdd308eSdjm	jmp		.L${dir}_loop6_enter
4005cdd308eSdjm.align	16
4015cdd308eSdjm.L${dir}_loop6:
4025cdd308eSdjm	aes${dir}	$rndkey1,$inout0
4035cdd308eSdjm	aes${dir}	$rndkey1,$inout1
4045cdd308eSdjm	dec		$rounds
4055cdd308eSdjm	aes${dir}	$rndkey1,$inout2
4065cdd308eSdjm	aes${dir}	$rndkey1,$inout3
4075cdd308eSdjm	aes${dir}	$rndkey1,$inout4
4085cdd308eSdjm	aes${dir}	$rndkey1,$inout5
4095cdd308eSdjm.L${dir}_loop6_enter:				# happens to be 16-byte aligned
4105cdd308eSdjm	$movkey		16($key),$rndkey1
4115cdd308eSdjm	aes${dir}	$rndkey0,$inout0
4125cdd308eSdjm	aes${dir}	$rndkey0,$inout1
4135cdd308eSdjm	lea		32($key),$key
4145cdd308eSdjm	aes${dir}	$rndkey0,$inout2
4155cdd308eSdjm	aes${dir}	$rndkey0,$inout3
4165cdd308eSdjm	aes${dir}	$rndkey0,$inout4
4175cdd308eSdjm	aes${dir}	$rndkey0,$inout5
4185cdd308eSdjm	$movkey		($key),$rndkey0
4195cdd308eSdjm	jnz		.L${dir}_loop6
4205cdd308eSdjm
4215cdd308eSdjm	aes${dir}	$rndkey1,$inout0
4225cdd308eSdjm	aes${dir}	$rndkey1,$inout1
4235cdd308eSdjm	aes${dir}	$rndkey1,$inout2
4245cdd308eSdjm	aes${dir}	$rndkey1,$inout3
4255cdd308eSdjm	aes${dir}	$rndkey1,$inout4
4265cdd308eSdjm	aes${dir}	$rndkey1,$inout5
4275cdd308eSdjm	aes${dir}last	$rndkey0,$inout0
4285cdd308eSdjm	aes${dir}last	$rndkey0,$inout1
4295cdd308eSdjm	aes${dir}last	$rndkey0,$inout2
4305cdd308eSdjm	aes${dir}last	$rndkey0,$inout3
4315cdd308eSdjm	aes${dir}last	$rndkey0,$inout4
4325cdd308eSdjm	aes${dir}last	$rndkey0,$inout5
4335cdd308eSdjm	ret
4345cdd308eSdjm.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
4355cdd308eSdjm___
4365cdd308eSdjm}
4375cdd308eSdjmsub aesni_generate8 {
4385cdd308eSdjmmy $dir=shift;
4395cdd308eSdjm# As already mentioned it takes in $key and $rounds, which are *not*
4405cdd308eSdjm# preserved. $inout[0-7] is cipher/clear text...
#
# Appends the private routine _aesni_${dir}rypt8 to $code; $dir is "enc"
# or "dec" at the call sites and shadows the file-level $dir (script
# directory) inside this sub.
#
# Structured like the 6x routine, widened to eight blocks: the prologue
# interleaves the xor of round key 0 with the first aes${dir} round
# (using $rndkey1), performs the first dec of $rounds, and jumps into
# the loop at .L${dir}_loop8_enter. One difference from the 6x routine:
# the load of the next $rndkey1 from 16($key) sits just *before* the
# entry label, so the prologue issues that load itself before jumping.
# Each full iteration applies two round keys ($rounds is halved on
# entry); the tail applies one more aes${dir} with $rndkey1 followed by
# aes${dir}last with $rndkey0.
4415cdd308eSdjm$code.=<<___;
4425cdd308eSdjm.type	_aesni_${dir}rypt8,\@abi-omnipotent
4435cdd308eSdjm.align	16
4445cdd308eSdjm_aesni_${dir}rypt8:
44522787c51Stb	_CET_ENDBR
4465cdd308eSdjm	$movkey		($key),$rndkey0
4475cdd308eSdjm	shr		\$1,$rounds
4485cdd308eSdjm	$movkey		16($key),$rndkey1
4495cdd308eSdjm	lea		32($key),$key
4505cdd308eSdjm	xorps		$rndkey0,$inout0
4515cdd308eSdjm	xorps		$rndkey0,$inout1
4525cdd308eSdjm	aes${dir}	$rndkey1,$inout0
4535cdd308eSdjm	pxor		$rndkey0,$inout2
4545cdd308eSdjm	aes${dir}	$rndkey1,$inout1
4555cdd308eSdjm	pxor		$rndkey0,$inout3
4565cdd308eSdjm	aes${dir}	$rndkey1,$inout2
4575cdd308eSdjm	pxor		$rndkey0,$inout4
4585cdd308eSdjm	aes${dir}	$rndkey1,$inout3
4595cdd308eSdjm	pxor		$rndkey0,$inout5
4605cdd308eSdjm	dec		$rounds
4615cdd308eSdjm	aes${dir}	$rndkey1,$inout4
4625cdd308eSdjm	pxor		$rndkey0,$inout6
4635cdd308eSdjm	aes${dir}	$rndkey1,$inout5
4645cdd308eSdjm	pxor		$rndkey0,$inout7
4655cdd308eSdjm	$movkey		($key),$rndkey0
4665cdd308eSdjm	aes${dir}	$rndkey1,$inout6
4675cdd308eSdjm	aes${dir}	$rndkey1,$inout7
4685cdd308eSdjm	$movkey		16($key),$rndkey1
4695cdd308eSdjm	jmp		.L${dir}_loop8_enter
4705cdd308eSdjm.align	16
4715cdd308eSdjm.L${dir}_loop8:
4725cdd308eSdjm	aes${dir}	$rndkey1,$inout0
4735cdd308eSdjm	aes${dir}	$rndkey1,$inout1
4745cdd308eSdjm	dec		$rounds
4755cdd308eSdjm	aes${dir}	$rndkey1,$inout2
4765cdd308eSdjm	aes${dir}	$rndkey1,$inout3
4775cdd308eSdjm	aes${dir}	$rndkey1,$inout4
4785cdd308eSdjm	aes${dir}	$rndkey1,$inout5
4795cdd308eSdjm	aes${dir}	$rndkey1,$inout6
4805cdd308eSdjm	aes${dir}	$rndkey1,$inout7
4815cdd308eSdjm	$movkey		16($key),$rndkey1
4825cdd308eSdjm.L${dir}_loop8_enter:				# happens to be 16-byte aligned
4835cdd308eSdjm	aes${dir}	$rndkey0,$inout0
4845cdd308eSdjm	aes${dir}	$rndkey0,$inout1
4855cdd308eSdjm	lea		32($key),$key
4865cdd308eSdjm	aes${dir}	$rndkey0,$inout2
4875cdd308eSdjm	aes${dir}	$rndkey0,$inout3
4885cdd308eSdjm	aes${dir}	$rndkey0,$inout4
4895cdd308eSdjm	aes${dir}	$rndkey0,$inout5
4905cdd308eSdjm	aes${dir}	$rndkey0,$inout6
4915cdd308eSdjm	aes${dir}	$rndkey0,$inout7
4925cdd308eSdjm	$movkey		($key),$rndkey0
4935cdd308eSdjm	jnz		.L${dir}_loop8
4945cdd308eSdjm
4955cdd308eSdjm	aes${dir}	$rndkey1,$inout0
4965cdd308eSdjm	aes${dir}	$rndkey1,$inout1
4975cdd308eSdjm	aes${dir}	$rndkey1,$inout2
4985cdd308eSdjm	aes${dir}	$rndkey1,$inout3
4995cdd308eSdjm	aes${dir}	$rndkey1,$inout4
5005cdd308eSdjm	aes${dir}	$rndkey1,$inout5
5015cdd308eSdjm	aes${dir}	$rndkey1,$inout6
5025cdd308eSdjm	aes${dir}	$rndkey1,$inout7
5035cdd308eSdjm	aes${dir}last	$rndkey0,$inout0
5045cdd308eSdjm	aes${dir}last	$rndkey0,$inout1
5055cdd308eSdjm	aes${dir}last	$rndkey0,$inout2
5065cdd308eSdjm	aes${dir}last	$rndkey0,$inout3
5075cdd308eSdjm	aes${dir}last	$rndkey0,$inout4
5085cdd308eSdjm	aes${dir}last	$rndkey0,$inout5
5095cdd308eSdjm	aes${dir}last	$rndkey0,$inout6
5105cdd308eSdjm	aes${dir}last	$rndkey0,$inout7
5115cdd308eSdjm	ret
5125cdd308eSdjm.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
5135cdd308eSdjm___
5145cdd308eSdjm}
# Instantiate the interleaved private routines (3x, 4x, 6x and 8x).
# The "dec" variants are always emitted; the "enc" variants only when
# $PREFIX is "aesni".
# NOTE(review): presumably the "AES" drop-in build has no caller for the
# interleaved encrypt routines -- their users are outside this excerpt;
# confirm before relying on it.
5156249468aSthib&aesni_generate3("enc") if ($PREFIX eq "aesni");
5166249468aSthib&aesni_generate3("dec");
5176249468aSthib&aesni_generate4("enc") if ($PREFIX eq "aesni");
5186249468aSthib&aesni_generate4("dec");
5195cdd308eSdjm&aesni_generate6("enc") if ($PREFIX eq "aesni");
5205cdd308eSdjm&aesni_generate6("dec");
5215cdd308eSdjm&aesni_generate8("enc") if ($PREFIX eq "aesni");
5225cdd308eSdjm&aesni_generate8("dec");
5236249468aSthib
5246249468aSthibif ($PREFIX eq "aesni") {
5255cdd308eSdjm########################################################################
5266249468aSthib# void aesni_ecb_encrypt (const void *in, void *out,
5276249468aSthib#			  size_t length, const AES_KEY *key,
5286249468aSthib#			  int enc);
5296249468aSthib$code.=<<___;
5306249468aSthib.globl	aesni_ecb_encrypt
5316249468aSthib.type	aesni_ecb_encrypt,\@function,5
5326249468aSthib.align	16
5336249468aSthibaesni_ecb_encrypt:
53422787c51Stb	_CET_ENDBR
5356249468aSthib	and	\$-16,$len
5366249468aSthib	jz	.Lecb_ret
5376249468aSthib
5385cdd308eSdjm	mov	240($key),$rounds	# key->rounds
5395cdd308eSdjm	$movkey	($key),$rndkey0
5405cdd308eSdjm	mov	$key,$key_		# backup $key
5415cdd308eSdjm	mov	$rounds,$rnds_		# backup $rounds
5425cdd308eSdjm	test	%r8d,%r8d		# 5th argument
5435cdd308eSdjm	jz	.Lecb_decrypt
5445cdd308eSdjm#--------------------------- ECB ENCRYPT ------------------------------#
5455cdd308eSdjm	cmp	\$0x80,$len
5465cdd308eSdjm	jb	.Lecb_enc_tail
5475cdd308eSdjm
5485cdd308eSdjm	movdqu	($inp),$inout0
5495cdd308eSdjm	movdqu	0x10($inp),$inout1
5505cdd308eSdjm	movdqu	0x20($inp),$inout2
5515cdd308eSdjm	movdqu	0x30($inp),$inout3
5525cdd308eSdjm	movdqu	0x40($inp),$inout4
5535cdd308eSdjm	movdqu	0x50($inp),$inout5
5545cdd308eSdjm	movdqu	0x60($inp),$inout6
5555cdd308eSdjm	movdqu	0x70($inp),$inout7
5565cdd308eSdjm	lea	0x80($inp),$inp
5575cdd308eSdjm	sub	\$0x80,$len
5585cdd308eSdjm	jmp	.Lecb_enc_loop8_enter
5595cdd308eSdjm.align 16
5605cdd308eSdjm.Lecb_enc_loop8:
5615cdd308eSdjm	movups	$inout0,($out)
5625cdd308eSdjm	mov	$key_,$key		# restore $key
5635cdd308eSdjm	movdqu	($inp),$inout0
5645cdd308eSdjm	mov	$rnds_,$rounds		# restore $rounds
5655cdd308eSdjm	movups	$inout1,0x10($out)
5665cdd308eSdjm	movdqu	0x10($inp),$inout1
5675cdd308eSdjm	movups	$inout2,0x20($out)
5685cdd308eSdjm	movdqu	0x20($inp),$inout2
5695cdd308eSdjm	movups	$inout3,0x30($out)
5705cdd308eSdjm	movdqu	0x30($inp),$inout3
5715cdd308eSdjm	movups	$inout4,0x40($out)
5725cdd308eSdjm	movdqu	0x40($inp),$inout4
5735cdd308eSdjm	movups	$inout5,0x50($out)
5745cdd308eSdjm	movdqu	0x50($inp),$inout5
5755cdd308eSdjm	movups	$inout6,0x60($out)
5765cdd308eSdjm	movdqu	0x60($inp),$inout6
5775cdd308eSdjm	movups	$inout7,0x70($out)
5785cdd308eSdjm	lea	0x80($out),$out
5795cdd308eSdjm	movdqu	0x70($inp),$inout7
5805cdd308eSdjm	lea	0x80($inp),$inp
5815cdd308eSdjm.Lecb_enc_loop8_enter:
5825cdd308eSdjm
5835cdd308eSdjm	call	_aesni_encrypt8
5845cdd308eSdjm
5855cdd308eSdjm	sub	\$0x80,$len
5865cdd308eSdjm	jnc	.Lecb_enc_loop8
5875cdd308eSdjm
5885cdd308eSdjm	movups	$inout0,($out)
5895cdd308eSdjm	mov	$key_,$key		# restore $key
5905cdd308eSdjm	movups	$inout1,0x10($out)
5915cdd308eSdjm	mov	$rnds_,$rounds		# restore $rounds
5925cdd308eSdjm	movups	$inout2,0x20($out)
5935cdd308eSdjm	movups	$inout3,0x30($out)
5945cdd308eSdjm	movups	$inout4,0x40($out)
5955cdd308eSdjm	movups	$inout5,0x50($out)
5965cdd308eSdjm	movups	$inout6,0x60($out)
5975cdd308eSdjm	movups	$inout7,0x70($out)
5985cdd308eSdjm	lea	0x80($out),$out
5995cdd308eSdjm	add	\$0x80,$len
6005cdd308eSdjm	jz	.Lecb_ret
6015cdd308eSdjm
6025cdd308eSdjm.Lecb_enc_tail:
6036249468aSthib	movups	($inp),$inout0
6046249468aSthib	cmp	\$0x20,$len
6055cdd308eSdjm	jb	.Lecb_enc_one
6066249468aSthib	movups	0x10($inp),$inout1
6076249468aSthib	je	.Lecb_enc_two
6086249468aSthib	movups	0x20($inp),$inout2
6095cdd308eSdjm	cmp	\$0x40,$len
6105cdd308eSdjm	jb	.Lecb_enc_three
6116249468aSthib	movups	0x30($inp),$inout3
6125cdd308eSdjm	je	.Lecb_enc_four
6135cdd308eSdjm	movups	0x40($inp),$inout4
6145cdd308eSdjm	cmp	\$0x60,$len
6155cdd308eSdjm	jb	.Lecb_enc_five
6165cdd308eSdjm	movups	0x50($inp),$inout5
6175cdd308eSdjm	je	.Lecb_enc_six
6185cdd308eSdjm	movdqu	0x60($inp),$inout6
6195cdd308eSdjm	call	_aesni_encrypt8
6206249468aSthib	movups	$inout0,($out)
6216249468aSthib	movups	$inout1,0x10($out)
6226249468aSthib	movups	$inout2,0x20($out)
6236249468aSthib	movups	$inout3,0x30($out)
6245cdd308eSdjm	movups	$inout4,0x40($out)
6255cdd308eSdjm	movups	$inout5,0x50($out)
6265cdd308eSdjm	movups	$inout6,0x60($out)
6276249468aSthib	jmp	.Lecb_ret
6286249468aSthib.align	16
6296249468aSthib.Lecb_enc_one:
6306249468aSthib___
6316249468aSthib	&aesni_generate1("enc",$key,$rounds);
6326249468aSthib$code.=<<___;
6336249468aSthib	movups	$inout0,($out)
6346249468aSthib	jmp	.Lecb_ret
6356249468aSthib.align	16
6366249468aSthib.Lecb_enc_two:
6375cdd308eSdjm	xorps	$inout2,$inout2
6386249468aSthib	call	_aesni_encrypt3
6396249468aSthib	movups	$inout0,($out)
6406249468aSthib	movups	$inout1,0x10($out)
6416249468aSthib	jmp	.Lecb_ret
6426249468aSthib.align	16
6436249468aSthib.Lecb_enc_three:
6446249468aSthib	call	_aesni_encrypt3
6456249468aSthib	movups	$inout0,($out)
6466249468aSthib	movups	$inout1,0x10($out)
6476249468aSthib	movups	$inout2,0x20($out)
6486249468aSthib	jmp	.Lecb_ret
6496249468aSthib.align	16
6505cdd308eSdjm.Lecb_enc_four:
6515cdd308eSdjm	call	_aesni_encrypt4
6526249468aSthib	movups	$inout0,($out)
6536249468aSthib	movups	$inout1,0x10($out)
6546249468aSthib	movups	$inout2,0x20($out)
6556249468aSthib	movups	$inout3,0x30($out)
6566249468aSthib	jmp	.Lecb_ret
6576249468aSthib.align	16
6585cdd308eSdjm.Lecb_enc_five:
6595cdd308eSdjm	xorps	$inout5,$inout5
6605cdd308eSdjm	call	_aesni_encrypt6
6615cdd308eSdjm	movups	$inout0,($out)
6625cdd308eSdjm	movups	$inout1,0x10($out)
6635cdd308eSdjm	movups	$inout2,0x20($out)
6645cdd308eSdjm	movups	$inout3,0x30($out)
6655cdd308eSdjm	movups	$inout4,0x40($out)
6665cdd308eSdjm	jmp	.Lecb_ret
6675cdd308eSdjm.align	16
6685cdd308eSdjm.Lecb_enc_six:
6695cdd308eSdjm	call	_aesni_encrypt6
6705cdd308eSdjm	movups	$inout0,($out)
6715cdd308eSdjm	movups	$inout1,0x10($out)
6725cdd308eSdjm	movups	$inout2,0x20($out)
6735cdd308eSdjm	movups	$inout3,0x30($out)
6745cdd308eSdjm	movups	$inout4,0x40($out)
6755cdd308eSdjm	movups	$inout5,0x50($out)
6765cdd308eSdjm	jmp	.Lecb_ret
6775cdd308eSdjm#--------------------------- ECB DECRYPT ------------------------------#
6785cdd308eSdjm.align	16
6795cdd308eSdjm.Lecb_decrypt:
6805cdd308eSdjm	cmp	\$0x80,$len
6815cdd308eSdjm	jb	.Lecb_dec_tail
6825cdd308eSdjm
6835cdd308eSdjm	movdqu	($inp),$inout0
6845cdd308eSdjm	movdqu	0x10($inp),$inout1
6855cdd308eSdjm	movdqu	0x20($inp),$inout2
6865cdd308eSdjm	movdqu	0x30($inp),$inout3
6875cdd308eSdjm	movdqu	0x40($inp),$inout4
6885cdd308eSdjm	movdqu	0x50($inp),$inout5
6895cdd308eSdjm	movdqu	0x60($inp),$inout6
6905cdd308eSdjm	movdqu	0x70($inp),$inout7
6915cdd308eSdjm	lea	0x80($inp),$inp
6925cdd308eSdjm	sub	\$0x80,$len
6935cdd308eSdjm	jmp	.Lecb_dec_loop8_enter
6945cdd308eSdjm.align 16
6955cdd308eSdjm.Lecb_dec_loop8:
6965cdd308eSdjm	movups	$inout0,($out)
6975cdd308eSdjm	mov	$key_,$key		# restore $key
6985cdd308eSdjm	movdqu	($inp),$inout0
6995cdd308eSdjm	mov	$rnds_,$rounds		# restore $rounds
7005cdd308eSdjm	movups	$inout1,0x10($out)
7015cdd308eSdjm	movdqu	0x10($inp),$inout1
7025cdd308eSdjm	movups	$inout2,0x20($out)
7035cdd308eSdjm	movdqu	0x20($inp),$inout2
7045cdd308eSdjm	movups	$inout3,0x30($out)
7055cdd308eSdjm	movdqu	0x30($inp),$inout3
7065cdd308eSdjm	movups	$inout4,0x40($out)
7075cdd308eSdjm	movdqu	0x40($inp),$inout4
7085cdd308eSdjm	movups	$inout5,0x50($out)
7095cdd308eSdjm	movdqu	0x50($inp),$inout5
7105cdd308eSdjm	movups	$inout6,0x60($out)
7115cdd308eSdjm	movdqu	0x60($inp),$inout6
7125cdd308eSdjm	movups	$inout7,0x70($out)
7135cdd308eSdjm	lea	0x80($out),$out
7145cdd308eSdjm	movdqu	0x70($inp),$inout7
7155cdd308eSdjm	lea	0x80($inp),$inp
7165cdd308eSdjm.Lecb_dec_loop8_enter:
7175cdd308eSdjm
7185cdd308eSdjm	call	_aesni_decrypt8
7195cdd308eSdjm
7205cdd308eSdjm	$movkey	($key_),$rndkey0
7215cdd308eSdjm	sub	\$0x80,$len
7225cdd308eSdjm	jnc	.Lecb_dec_loop8
7235cdd308eSdjm
7245cdd308eSdjm	movups	$inout0,($out)
7255cdd308eSdjm	mov	$key_,$key		# restore $key
7265cdd308eSdjm	movups	$inout1,0x10($out)
7275cdd308eSdjm	mov	$rnds_,$rounds		# restore $rounds
7285cdd308eSdjm	movups	$inout2,0x20($out)
7295cdd308eSdjm	movups	$inout3,0x30($out)
7305cdd308eSdjm	movups	$inout4,0x40($out)
7315cdd308eSdjm	movups	$inout5,0x50($out)
7325cdd308eSdjm	movups	$inout6,0x60($out)
7335cdd308eSdjm	movups	$inout7,0x70($out)
7345cdd308eSdjm	lea	0x80($out),$out
7355cdd308eSdjm	add	\$0x80,$len
7365cdd308eSdjm	jz	.Lecb_ret
7375cdd308eSdjm
7385cdd308eSdjm.Lecb_dec_tail:
7395cdd308eSdjm	movups	($inp),$inout0
7405cdd308eSdjm	cmp	\$0x20,$len
7415cdd308eSdjm	jb	.Lecb_dec_one
7425cdd308eSdjm	movups	0x10($inp),$inout1
7435cdd308eSdjm	je	.Lecb_dec_two
7445cdd308eSdjm	movups	0x20($inp),$inout2
7455cdd308eSdjm	cmp	\$0x40,$len
7465cdd308eSdjm	jb	.Lecb_dec_three
7475cdd308eSdjm	movups	0x30($inp),$inout3
7485cdd308eSdjm	je	.Lecb_dec_four
7495cdd308eSdjm	movups	0x40($inp),$inout4
7505cdd308eSdjm	cmp	\$0x60,$len
7515cdd308eSdjm	jb	.Lecb_dec_five
7525cdd308eSdjm	movups	0x50($inp),$inout5
7535cdd308eSdjm	je	.Lecb_dec_six
7545cdd308eSdjm	movups	0x60($inp),$inout6
7555cdd308eSdjm	$movkey	($key),$rndkey0
7565cdd308eSdjm	call	_aesni_decrypt8
7575cdd308eSdjm	movups	$inout0,($out)
7585cdd308eSdjm	movups	$inout1,0x10($out)
7595cdd308eSdjm	movups	$inout2,0x20($out)
7605cdd308eSdjm	movups	$inout3,0x30($out)
7615cdd308eSdjm	movups	$inout4,0x40($out)
7625cdd308eSdjm	movups	$inout5,0x50($out)
7635cdd308eSdjm	movups	$inout6,0x60($out)
7645cdd308eSdjm	jmp	.Lecb_ret
7655cdd308eSdjm.align	16
7666249468aSthib.Lecb_dec_one:
7676249468aSthib___
7686249468aSthib	&aesni_generate1("dec",$key,$rounds);
7696249468aSthib$code.=<<___;
7706249468aSthib	movups	$inout0,($out)
7716249468aSthib	jmp	.Lecb_ret
7726249468aSthib.align	16
7736249468aSthib.Lecb_dec_two:
7745cdd308eSdjm	xorps	$inout2,$inout2
7756249468aSthib	call	_aesni_decrypt3
7766249468aSthib	movups	$inout0,($out)
7776249468aSthib	movups	$inout1,0x10($out)
7786249468aSthib	jmp	.Lecb_ret
7796249468aSthib.align	16
7806249468aSthib.Lecb_dec_three:
7816249468aSthib	call	_aesni_decrypt3
7826249468aSthib	movups	$inout0,($out)
7836249468aSthib	movups	$inout1,0x10($out)
7846249468aSthib	movups	$inout2,0x20($out)
7855cdd308eSdjm	jmp	.Lecb_ret
7865cdd308eSdjm.align	16
7875cdd308eSdjm.Lecb_dec_four:
7885cdd308eSdjm	call	_aesni_decrypt4
7895cdd308eSdjm	movups	$inout0,($out)
7905cdd308eSdjm	movups	$inout1,0x10($out)
7915cdd308eSdjm	movups	$inout2,0x20($out)
7925cdd308eSdjm	movups	$inout3,0x30($out)
7935cdd308eSdjm	jmp	.Lecb_ret
7945cdd308eSdjm.align	16
7955cdd308eSdjm.Lecb_dec_five:
7965cdd308eSdjm	xorps	$inout5,$inout5
7975cdd308eSdjm	call	_aesni_decrypt6
7985cdd308eSdjm	movups	$inout0,($out)
7995cdd308eSdjm	movups	$inout1,0x10($out)
8005cdd308eSdjm	movups	$inout2,0x20($out)
8015cdd308eSdjm	movups	$inout3,0x30($out)
8025cdd308eSdjm	movups	$inout4,0x40($out)
8035cdd308eSdjm	jmp	.Lecb_ret
8045cdd308eSdjm.align	16
8055cdd308eSdjm.Lecb_dec_six:
8065cdd308eSdjm	call	_aesni_decrypt6
8075cdd308eSdjm	movups	$inout0,($out)
8085cdd308eSdjm	movups	$inout1,0x10($out)
8095cdd308eSdjm	movups	$inout2,0x20($out)
8105cdd308eSdjm	movups	$inout3,0x30($out)
8115cdd308eSdjm	movups	$inout4,0x40($out)
8125cdd308eSdjm	movups	$inout5,0x50($out)
8136249468aSthib
8146249468aSthib.Lecb_ret:
8156249468aSthib	ret
8166249468aSthib.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
8176249468aSthib___
8185cdd308eSdjm
8195cdd308eSdjm{
8205cdd308eSdjm######################################################################
8215cdd308eSdjm# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
8225cdd308eSdjm#                         size_t blocks, const AES_KEY *key,
8235cdd308eSdjm#                         const char *ivec,char *cmac);
8245cdd308eSdjm#
8255cdd308eSdjm# Handles only complete blocks, operates on 64-bit counter and
8265cdd308eSdjm# does not update *ivec! Nor does it finalize CMAC value
8275cdd308eSdjm# (see engine/eng_aesni.c for details)
8285cdd308eSdjm#
8295cdd308eSdjm{
# Register aliases used by the CCM64 routines generated in this brace
# block (both the encrypt and the decrypt routine reference them).
8305cdd308eSdjmmy $cmac="%r9";	# 6th argument
8315cdd308eSdjm
8325cdd308eSdjmmy $increment="%xmm6";
8335cdd308eSdjmmy $bswap_mask="%xmm7";
8345cdd308eSdjm
# Emit the symbol, type and 16-byte alignment for
# aesni_ccm64_encrypt_blocks, followed by the CET landing pad macro.
8355cdd308eSdjm$code.=<<___;
8365cdd308eSdjm.globl	aesni_ccm64_encrypt_blocks
8375cdd308eSdjm.type	aesni_ccm64_encrypt_blocks,\@function,6
8385cdd308eSdjm.align	16
8395cdd308eSdjmaesni_ccm64_encrypt_blocks:
84022787c51Stb	_CET_ENDBR
8415cdd308eSdjm___
# Win64 only: reserve 0x58 bytes of stack and spill %xmm6-%xmm9 into it
# (these are non-volatile in the Win64 calling convention).  The matching
# reload is emitted in the Win64-only epilogue further down.
8425cdd308eSdjm$code.=<<___ if ($win64);
8435cdd308eSdjm	lea	-0x58(%rsp),%rsp
8445cdd308eSdjm	movaps	%xmm6,(%rsp)
8455cdd308eSdjm	movaps	%xmm7,0x10(%rsp)
8465cdd308eSdjm	movaps	%xmm8,0x20(%rsp)
8475cdd308eSdjm	movaps	%xmm9,0x30(%rsp)
8485cdd308eSdjm.Lccm64_enc_body:
8495cdd308eSdjm___
# Main body.
#
# Setup: read the round count from offset 240 of the key schedule and
# halve it (the inner loop consumes two round keys per iteration), load
# the counter block from ($ivp) and the running CMAC from ($cmac), and
# keep a copy of the counter in $iv shuffled through $bswap_mask so that
# paddq with $increment can advance it.  $key_ and $rnds_ hold backups of
# the key pointer and the halved round count for reuse on every block.
#
# Per 16-byte block (.Lccm64_enc_outer): the counter block ($inout0) and
# the CMAC XORed with the input block ($inout1) go through the AES rounds
# side by side in .Lccm64_enc2_loop.  The encrypted counter is then XORed
# into the input to form the output block, and the next counter block is
# rebuilt from $iv.  $len counts blocks and is decremented once per
# iteration; the loop is entered unconditionally, so a zero block count
# is not checked for here.
#
# After the last block the updated CMAC is written back to ($cmac).
#
# NOTE(review): $iv, $in0, $inout0/1, $rndkey0/1, $key_, $rnds_, $ivp and
# $movkey are defined earlier in the file, outside this block; the exact
# contents of .Lincrement64 and .Lbswap_mask are likewise not shown here.
8505cdd308eSdjm$code.=<<___;
8515cdd308eSdjm	mov	240($key),$rounds		# key->rounds
8525cdd308eSdjm	movdqu	($ivp),$iv
8535cdd308eSdjm	movdqa	.Lincrement64(%rip),$increment
8545cdd308eSdjm	movdqa	.Lbswap_mask(%rip),$bswap_mask
8555cdd308eSdjm
8565cdd308eSdjm	shr	\$1,$rounds
8575cdd308eSdjm	lea	0($key),$key_
8585cdd308eSdjm	movdqu	($cmac),$inout1
8595cdd308eSdjm	movdqa	$iv,$inout0
8605cdd308eSdjm	mov	$rounds,$rnds_
8615cdd308eSdjm	pshufb	$bswap_mask,$iv
8625cdd308eSdjm	jmp	.Lccm64_enc_outer
8635cdd308eSdjm.align	16
8645cdd308eSdjm.Lccm64_enc_outer:
8655cdd308eSdjm	$movkey	($key_),$rndkey0
8665cdd308eSdjm	mov	$rnds_,$rounds
8675cdd308eSdjm	movups	($inp),$in0			# load inp
8685cdd308eSdjm
8695cdd308eSdjm	xorps	$rndkey0,$inout0		# counter
8705cdd308eSdjm	$movkey	16($key_),$rndkey1
8715cdd308eSdjm	xorps	$in0,$rndkey0
8725cdd308eSdjm	lea	32($key_),$key
8735cdd308eSdjm	xorps	$rndkey0,$inout1		# cmac^=inp
8745cdd308eSdjm	$movkey	($key),$rndkey0
8755cdd308eSdjm
8765cdd308eSdjm.Lccm64_enc2_loop:
8775cdd308eSdjm	aesenc	$rndkey1,$inout0
8785cdd308eSdjm	dec	$rounds
8795cdd308eSdjm	aesenc	$rndkey1,$inout1
8805cdd308eSdjm	$movkey	16($key),$rndkey1
8815cdd308eSdjm	aesenc	$rndkey0,$inout0
8825cdd308eSdjm	lea	32($key),$key
8835cdd308eSdjm	aesenc	$rndkey0,$inout1
8845cdd308eSdjm	$movkey	0($key),$rndkey0
8855cdd308eSdjm	jnz	.Lccm64_enc2_loop
8865cdd308eSdjm	aesenc	$rndkey1,$inout0
8875cdd308eSdjm	aesenc	$rndkey1,$inout1
8885cdd308eSdjm	paddq	$increment,$iv
8895cdd308eSdjm	aesenclast	$rndkey0,$inout0
8905cdd308eSdjm	aesenclast	$rndkey0,$inout1
8915cdd308eSdjm
8925cdd308eSdjm	dec	$len
8935cdd308eSdjm	lea	16($inp),$inp
8945cdd308eSdjm	xorps	$inout0,$in0			# inp ^= E(iv)
8955cdd308eSdjm	movdqa	$iv,$inout0
8965cdd308eSdjm	movups	$in0,($out)			# save output
8975cdd308eSdjm	lea	16($out),$out
8985cdd308eSdjm	pshufb	$bswap_mask,$inout0
8995cdd308eSdjm	jnz	.Lccm64_enc_outer
9005cdd308eSdjm
9015cdd308eSdjm	movups	$inout1,($cmac)
9025cdd308eSdjm___
# Win64 only: reload %xmm6-%xmm9 and release the 0x58-byte save area.
9035cdd308eSdjm$code.=<<___ if ($win64);
9045cdd308eSdjm	movaps	(%rsp),%xmm6
9055cdd308eSdjm	movaps	0x10(%rsp),%xmm7
9065cdd308eSdjm	movaps	0x20(%rsp),%xmm8
9075cdd308eSdjm	movaps	0x30(%rsp),%xmm9
9085cdd308eSdjm	lea	0x58(%rsp),%rsp
9095cdd308eSdjm.Lccm64_enc_ret:
9105cdd308eSdjm___
# Return to the caller and record the symbol size.
9115cdd308eSdjm$code.=<<___;
9125cdd308eSdjm	ret
9135cdd308eSdjm.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
9145cdd308eSdjm___
9155cdd308eSdjm######################################################################
# Emit the symbol, type and 16-byte alignment for
# aesni_ccm64_decrypt_blocks, followed by the CET landing pad macro.
9165cdd308eSdjm$code.=<<___;
9175cdd308eSdjm.globl	aesni_ccm64_decrypt_blocks
9185cdd308eSdjm.type	aesni_ccm64_decrypt_blocks,\@function,6
9195cdd308eSdjm.align	16
9205cdd308eSdjmaesni_ccm64_decrypt_blocks:
921*5caf18b2Stb	_CET_ENDBR
9225cdd308eSdjm___
# Win64 only: reserve 0x58 bytes of stack and spill %xmm6-%xmm9 into it
# (these are non-volatile in the Win64 calling convention).  The matching
# reload is emitted in the Win64-only epilogue further down.
9235cdd308eSdjm$code.=<<___ if ($win64);
9245cdd308eSdjm	lea	-0x58(%rsp),%rsp
9255cdd308eSdjm	movaps	%xmm6,(%rsp)
9265cdd308eSdjm	movaps	%xmm7,0x10(%rsp)
9275cdd308eSdjm	movaps	%xmm8,0x20(%rsp)
9285cdd308eSdjm	movaps	%xmm9,0x30(%rsp)
9295cdd308eSdjm.Lccm64_dec_body:
9305cdd308eSdjm___
# Setup: read the round count from offset 240 of the key schedule, load
# the counter block from ($ivp) and the running CMAC from ($cmac), back up
# the (not yet halved) round count in $rnds_ and the key pointer in $key_,
# and keep a copy of the counter in $iv shuffled through $bswap_mask so
# that paddq with $increment can advance it.
9315cdd308eSdjm$code.=<<___;
9325cdd308eSdjm	mov	240($key),$rounds		# key->rounds
9335cdd308eSdjm	movups	($ivp),$iv
9345cdd308eSdjm	movdqu	($cmac),$inout1
9355cdd308eSdjm	movdqa	.Lincrement64(%rip),$increment
9365cdd308eSdjm	movdqa	.Lbswap_mask(%rip),$bswap_mask
9375cdd308eSdjm
9385cdd308eSdjm	movaps	$iv,$inout0
9395cdd308eSdjm	mov	$rounds,$rnds_
9405cdd308eSdjm	mov	$key,$key_
9415cdd308eSdjm	pshufb	$bswap_mask,$iv
9425cdd308eSdjm___
# Encrypt the first counter block on its own before entering the loop.
# NOTE(review): aesni_generate1 is defined elsewhere in this file; it
# presumably operates on $inout0 by default, since the loop below XORs
# $inout0 into the input as E(iv) -- confirm against its definition.
9435cdd308eSdjm	&aesni_generate1("enc",$key,$rounds);
# Loop (.Lccm64_dec_outer): XOR the encrypted counter into the input to
# recover the output block and store it, then rebuild the next counter
# block from $iv.  While blocks remain, .Lccm64_dec2_loop pushes that next
# counter ($inout0) and the CMAC XORed with the block just produced
# ($inout1) through the AES rounds side by side, with the round count
# restored from $rnds_ and halved because two round keys are consumed per
# iteration; the next input block is loaded while the last rounds run.
# When $len reaches zero, control leaves through .Lccm64_dec_break with
# the final output block still held in $in0.
9445cdd308eSdjm$code.=<<___;
9455cdd308eSdjm	movups	($inp),$in0			# load inp
9465cdd308eSdjm	paddq	$increment,$iv
9475cdd308eSdjm	lea	16($inp),$inp
9485cdd308eSdjm	jmp	.Lccm64_dec_outer
9495cdd308eSdjm.align	16
9505cdd308eSdjm.Lccm64_dec_outer:
9515cdd308eSdjm	xorps	$inout0,$in0			# inp ^= E(iv)
9525cdd308eSdjm	movdqa	$iv,$inout0
9535cdd308eSdjm	mov	$rnds_,$rounds
9545cdd308eSdjm	movups	$in0,($out)			# save output
9555cdd308eSdjm	lea	16($out),$out
9565cdd308eSdjm	pshufb	$bswap_mask,$inout0
9575cdd308eSdjm
9585cdd308eSdjm	sub	\$1,$len
9595cdd308eSdjm	jz	.Lccm64_dec_break
9605cdd308eSdjm
9615cdd308eSdjm	$movkey	($key_),$rndkey0
9625cdd308eSdjm	shr	\$1,$rounds
9635cdd308eSdjm	$movkey	16($key_),$rndkey1
9645cdd308eSdjm	xorps	$rndkey0,$in0
9655cdd308eSdjm	lea	32($key_),$key
9665cdd308eSdjm	xorps	$rndkey0,$inout0
9675cdd308eSdjm	xorps	$in0,$inout1			# cmac^=out
9685cdd308eSdjm	$movkey	($key),$rndkey0
9695cdd308eSdjm
9705cdd308eSdjm.Lccm64_dec2_loop:
9715cdd308eSdjm	aesenc	$rndkey1,$inout0
9725cdd308eSdjm	dec	$rounds
9735cdd308eSdjm	aesenc	$rndkey1,$inout1
9745cdd308eSdjm	$movkey	16($key),$rndkey1
9755cdd308eSdjm	aesenc	$rndkey0,$inout0
9765cdd308eSdjm	lea	32($key),$key
9775cdd308eSdjm	aesenc	$rndkey0,$inout1
9785cdd308eSdjm	$movkey	0($key),$rndkey0
9795cdd308eSdjm	jnz	.Lccm64_dec2_loop
9805cdd308eSdjm	movups	($inp),$in0			# load inp
9815cdd308eSdjm	paddq	$increment,$iv
9825cdd308eSdjm	aesenc	$rndkey1,$inout0
9835cdd308eSdjm	aesenc	$rndkey1,$inout1
9845cdd308eSdjm	lea	16($inp),$inp
9855cdd308eSdjm	aesenclast	$rndkey0,$inout0
9865cdd308eSdjm	aesenclast	$rndkey0,$inout1
9875cdd308eSdjm	jmp	.Lccm64_dec_outer
9885cdd308eSdjm
9895cdd308eSdjm.align	16
9905cdd308eSdjm.Lccm64_dec_break:
9915cdd308eSdjm	#xorps	$in0,$inout1			# cmac^=out
9925cdd308eSdjm___
# Fold the final output block into the CMAC with one more single-block
# encryption using the backed-up key pointer.
# NOTE(review): the 4th and 5th arguments presumably make aesni_generate1
# work on $inout1 and XOR $in0 into it first (which would explain the
# commented-out xorps above) -- confirm against its definition.
9935cdd308eSdjm	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Write the updated CMAC back to ($cmac).
9945cdd308eSdjm$code.=<<___;
9955cdd308eSdjm	movups	$inout1,($cmac)
9965cdd308eSdjm___
# Win64 only: reload %xmm6-%xmm9 and release the 0x58-byte save area.
9975cdd308eSdjm$code.=<<___ if ($win64);
9985cdd308eSdjm	movaps	(%rsp),%xmm6
9995cdd308eSdjm	movaps	0x10(%rsp),%xmm7
10005cdd308eSdjm	movaps	0x20(%rsp),%xmm8
10015cdd308eSdjm	movaps	0x30(%rsp),%xmm9
10025cdd308eSdjm	lea	0x58(%rsp),%rsp
10035cdd308eSdjm.Lccm64_dec_ret:
10045cdd308eSdjm___
# Return to the caller and record the symbol size.
10055cdd308eSdjm$code.=<<___;
10065cdd308eSdjm	ret
10075cdd308eSdjm.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
10085cdd308eSdjm___
10095cdd308eSdjm}
10105cdd308eSdjm######################################################################
10115cdd308eSdjm# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
10125cdd308eSdjm#                         size_t blocks, const AES_KEY *key,
10135cdd308eSdjm#                         const char *ivec);
10145cdd308eSdjm#
10155cdd308eSdjm# Handles only complete blocks, operates on 32-bit counter and
10165cdd308eSdjm# does not update *ivec! (see engine/eng_aesni.c for details)
10175cdd308eSdjm#
10185cdd308eSdjm{
# Generator for aesni_ctr32_encrypt_blocks.
#
# Stack frame: 0x20 bytes hold the two saved counter vectors (at 0x00 and
# 0x10 off %rsp); on Win64 a further 160 bytes hold %xmm6-%xmm15 in ten
# 16-byte slots at 0x20..0xb0.
#
# Register aliases: $in0-$in3 are input-block temporaries, $iv0/$iv1 are
# the two counter vectors, $ivec is the IV with its counter dword cleared,
# and $bswap_mask holds the shuffle mask loaded from .Lbswap_mask.
10195c104365Sjsingmy $frame_size = 0x20+($win64?160:0);
10205cdd308eSdjmmy ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
10215cdd308eSdjmmy ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
10225cdd308eSdjmmy $bswap_mask="%xmm15";
10235cdd308eSdjm
# Entry: remember the incoming %rsp in %rax, push %rbp and carve out the
# frame described above.
10245cdd308eSdjm$code.=<<___;
10255cdd308eSdjm.globl	aesni_ctr32_encrypt_blocks
10265cdd308eSdjm.type	aesni_ctr32_encrypt_blocks,\@function,5
10275cdd308eSdjm.align	16
10285cdd308eSdjmaesni_ctr32_encrypt_blocks:
102922787c51Stb	_CET_ENDBR
10305c104365Sjsing	lea	(%rsp),%rax
10315c104365Sjsing	push	%rbp
10325c104365Sjsing	sub	\$$frame_size,%rsp
10335cdd308eSdjm___
# Win64 only: spill %xmm6-%xmm15 (non-volatile in the Win64 calling
# convention) above the two counter slots.
10345cdd308eSdjm$code.=<<___ if ($win64);
10355cdd308eSdjm	movaps	%xmm6,0x20(%rsp)
10365cdd308eSdjm	movaps	%xmm7,0x30(%rsp)
10375cdd308eSdjm	movaps	%xmm8,0x40(%rsp)
10385cdd308eSdjm	movaps	%xmm9,0x50(%rsp)
10395cdd308eSdjm	movaps	%xmm10,0x60(%rsp)
10405cdd308eSdjm	movaps	%xmm11,0x70(%rsp)
10415cdd308eSdjm	movaps	%xmm12,0x80(%rsp)
10425cdd308eSdjm	movaps	%xmm13,0x90(%rsp)
10435cdd308eSdjm	movaps	%xmm14,0xa0(%rsp)
10445cdd308eSdjm	movaps	%xmm15,0xb0(%rsp)
10455cdd308eSdjm.Lctr32_body:
10465cdd308eSdjm___
# Main body.
#
#  * %rbp is pointed at the saved-%rbp slot (incoming %rsp minus 8) so
#    the epilogue can unwind with lea (%rbp),%rsp followed by pop %rbp.
#  * A block count of exactly 1 branches to .Lctr32_one_shortcut, which
#    loads the IV unchanged as the counter block and joins the one-block
#    path.
#  * Otherwise the 32-bit counter is extracted from dword 3 of the IV,
#    that dword is zeroed in $ivec, and the extracted value is converted
#    with bswap.  $iv0 receives counter+0..2 and $iv1 counter+3..5 in
#    dwords 0..2; both vectors are saved on the stack in that form and
#    then shuffled through $bswap_mask.  pshufd picks one counter per
#    block into the upper dword and por merges in the counter-less $ivec.
#  * With six or more blocks, the round count is halved and backed up
#    together with the key pointer, and .Lctr32_loop6 processes six
#    blocks per iteration using an inlined, hand-interleaved six-way
#    encryption.  While the final rounds run, the saved counter vectors
#    are advanced with paddd (using .Lincrement32), stored back and
#    re-shuffled, and the six input blocks are loaded for the XOR.
#  * On leaving the loop with blocks left over, $key is restored and the
#    round count is rebuilt as 2*rounds+1 for the subroutine-based tail.
#  * .Lctr32_tail dispatches on the remaining count: one, two, three or
#    four blocks have their own paths; five blocks use _aesni_encrypt6
#    with the sixth lane zeroed.
#
# NOTE(review): $inout0-5, $rndkey0/1, $key_, $rnds_, $ivp, $movkey and
# the _aesni_encrypt3/4/6 helpers are defined outside this block; the
# exact contents of .Lincrement32 and .Lbswap_mask are not shown here.
10475cdd308eSdjm$code.=<<___;
10485c104365Sjsing	lea	-8(%rax),%rbp
10495cdd308eSdjm	cmp	\$1,$len
10505cdd308eSdjm	je	.Lctr32_one_shortcut
10515cdd308eSdjm
10525cdd308eSdjm	movdqu	($ivp),$ivec
10535cdd308eSdjm	movdqa	.Lbswap_mask(%rip),$bswap_mask
10545cdd308eSdjm	xor	$rounds,$rounds
10555cdd308eSdjm	pextrd	\$3,$ivec,$rnds_		# pull 32-bit counter
10565cdd308eSdjm	pinsrd	\$3,$rounds,$ivec		# wipe 32-bit counter
10575cdd308eSdjm
10585cdd308eSdjm	mov	240($key),$rounds		# key->rounds
10595cdd308eSdjm	bswap	$rnds_
10605cdd308eSdjm	pxor	$iv0,$iv0			# vector of 3 32-bit counters
10615cdd308eSdjm	pxor	$iv1,$iv1			# vector of 3 32-bit counters
10625cdd308eSdjm	pinsrd	\$0,$rnds_,$iv0
10635cdd308eSdjm	lea	3($rnds_),$key_
10645cdd308eSdjm	pinsrd	\$0,$key_,$iv1
10655cdd308eSdjm	inc	$rnds_
10665cdd308eSdjm	pinsrd	\$1,$rnds_,$iv0
10675cdd308eSdjm	inc	$key_
10685cdd308eSdjm	pinsrd	\$1,$key_,$iv1
10695cdd308eSdjm	inc	$rnds_
10705cdd308eSdjm	pinsrd	\$2,$rnds_,$iv0
10715cdd308eSdjm	inc	$key_
10725cdd308eSdjm	pinsrd	\$2,$key_,$iv1
10735c104365Sjsing	movdqa	$iv0,0x00(%rsp)
10745cdd308eSdjm	pshufb	$bswap_mask,$iv0
10755c104365Sjsing	movdqa	$iv1,0x10(%rsp)
10765cdd308eSdjm	pshufb	$bswap_mask,$iv1
10775cdd308eSdjm
10785cdd308eSdjm	pshufd	\$`3<<6`,$iv0,$inout0		# place counter to upper dword
10795cdd308eSdjm	pshufd	\$`2<<6`,$iv0,$inout1
10805cdd308eSdjm	pshufd	\$`1<<6`,$iv0,$inout2
10815cdd308eSdjm	cmp	\$6,$len
10825cdd308eSdjm	jb	.Lctr32_tail
10835cdd308eSdjm	shr	\$1,$rounds
10845cdd308eSdjm	mov	$key,$key_			# backup $key
10855cdd308eSdjm	mov	$rounds,$rnds_			# backup $rounds
10865cdd308eSdjm	sub	\$6,$len
10875cdd308eSdjm	jmp	.Lctr32_loop6
10885cdd308eSdjm
10895cdd308eSdjm.align	16
10905cdd308eSdjm.Lctr32_loop6:
10915cdd308eSdjm	pshufd	\$`3<<6`,$iv1,$inout3
10925cdd308eSdjm	por	$ivec,$inout0			# merge counter-less ivec
10935cdd308eSdjm	 $movkey	($key_),$rndkey0
10945cdd308eSdjm	pshufd	\$`2<<6`,$iv1,$inout4
10955cdd308eSdjm	por	$ivec,$inout1
10965cdd308eSdjm	 $movkey	16($key_),$rndkey1
10975cdd308eSdjm	pshufd	\$`1<<6`,$iv1,$inout5
10985cdd308eSdjm	por	$ivec,$inout2
10995cdd308eSdjm	por	$ivec,$inout3
11005cdd308eSdjm	 xorps		$rndkey0,$inout0
11015cdd308eSdjm	por	$ivec,$inout4
11025cdd308eSdjm	por	$ivec,$inout5
11035cdd308eSdjm
11045cdd308eSdjm	# inline _aesni_encrypt6 and interleave last rounds
11055cdd308eSdjm	# with own code...
11065cdd308eSdjm
11075cdd308eSdjm	pxor		$rndkey0,$inout1
11085cdd308eSdjm	aesenc		$rndkey1,$inout0
11095cdd308eSdjm	lea		32($key_),$key
11105cdd308eSdjm	pxor		$rndkey0,$inout2
11115cdd308eSdjm	aesenc		$rndkey1,$inout1
11125cdd308eSdjm	 movdqa		.Lincrement32(%rip),$iv1
11135cdd308eSdjm	pxor		$rndkey0,$inout3
11145cdd308eSdjm	aesenc		$rndkey1,$inout2
11155c104365Sjsing	 movdqa		(%rsp),$iv0
11165cdd308eSdjm	pxor		$rndkey0,$inout4
11175cdd308eSdjm	aesenc		$rndkey1,$inout3
11185cdd308eSdjm	pxor		$rndkey0,$inout5
11195cdd308eSdjm	$movkey		($key),$rndkey0
11205cdd308eSdjm	dec		$rounds
11215cdd308eSdjm	aesenc		$rndkey1,$inout4
11225cdd308eSdjm	aesenc		$rndkey1,$inout5
11235cdd308eSdjm	jmp		.Lctr32_enc_loop6_enter
11245cdd308eSdjm.align	16
11255cdd308eSdjm.Lctr32_enc_loop6:
11265cdd308eSdjm	aesenc		$rndkey1,$inout0
11275cdd308eSdjm	aesenc		$rndkey1,$inout1
11285cdd308eSdjm	dec		$rounds
11295cdd308eSdjm	aesenc		$rndkey1,$inout2
11305cdd308eSdjm	aesenc		$rndkey1,$inout3
11315cdd308eSdjm	aesenc		$rndkey1,$inout4
11325cdd308eSdjm	aesenc		$rndkey1,$inout5
11335cdd308eSdjm.Lctr32_enc_loop6_enter:
11345cdd308eSdjm	$movkey		16($key),$rndkey1
11355cdd308eSdjm	aesenc		$rndkey0,$inout0
11365cdd308eSdjm	aesenc		$rndkey0,$inout1
11375cdd308eSdjm	lea		32($key),$key
11385cdd308eSdjm	aesenc		$rndkey0,$inout2
11395cdd308eSdjm	aesenc		$rndkey0,$inout3
11405cdd308eSdjm	aesenc		$rndkey0,$inout4
11415cdd308eSdjm	aesenc		$rndkey0,$inout5
11425cdd308eSdjm	$movkey		($key),$rndkey0
11435cdd308eSdjm	jnz		.Lctr32_enc_loop6
11445cdd308eSdjm
11455cdd308eSdjm	aesenc		$rndkey1,$inout0
11465cdd308eSdjm	 paddd		$iv1,$iv0		# increment counter vector
11475cdd308eSdjm	aesenc		$rndkey1,$inout1
11485c104365Sjsing	 paddd		0x10(%rsp),$iv1
11495cdd308eSdjm	aesenc		$rndkey1,$inout2
11505c104365Sjsing	 movdqa		$iv0,0x00(%rsp)		# save counter vector
11515cdd308eSdjm	aesenc		$rndkey1,$inout3
11525c104365Sjsing	 movdqa		$iv1,0x10(%rsp)
11535cdd308eSdjm	aesenc		$rndkey1,$inout4
11545cdd308eSdjm	 pshufb		$bswap_mask,$iv0	# byte swap
11555cdd308eSdjm	aesenc		$rndkey1,$inout5
11565cdd308eSdjm	 pshufb		$bswap_mask,$iv1
11575cdd308eSdjm
11585cdd308eSdjm	aesenclast	$rndkey0,$inout0
11595cdd308eSdjm	 movups		($inp),$in0		# load input
11605cdd308eSdjm	aesenclast	$rndkey0,$inout1
11615cdd308eSdjm	 movups		0x10($inp),$in1
11625cdd308eSdjm	aesenclast	$rndkey0,$inout2
11635cdd308eSdjm	 movups		0x20($inp),$in2
11645cdd308eSdjm	aesenclast	$rndkey0,$inout3
11655cdd308eSdjm	 movups		0x30($inp),$in3
11665cdd308eSdjm	aesenclast	$rndkey0,$inout4
11675cdd308eSdjm	 movups		0x40($inp),$rndkey1
11685cdd308eSdjm	aesenclast	$rndkey0,$inout5
11695cdd308eSdjm	 movups		0x50($inp),$rndkey0
11705cdd308eSdjm	 lea	0x60($inp),$inp
11715cdd308eSdjm
11725cdd308eSdjm	xorps	$inout0,$in0			# xor
11735cdd308eSdjm	 pshufd	\$`3<<6`,$iv0,$inout0
11745cdd308eSdjm	xorps	$inout1,$in1
11755cdd308eSdjm	 pshufd	\$`2<<6`,$iv0,$inout1
11765cdd308eSdjm	movups	$in0,($out)			# store output
11775cdd308eSdjm	xorps	$inout2,$in2
11785cdd308eSdjm	 pshufd	\$`1<<6`,$iv0,$inout2
11795cdd308eSdjm	movups	$in1,0x10($out)
11805cdd308eSdjm	xorps	$inout3,$in3
11815cdd308eSdjm	movups	$in2,0x20($out)
11825cdd308eSdjm	xorps	$inout4,$rndkey1
11835cdd308eSdjm	movups	$in3,0x30($out)
11845cdd308eSdjm	xorps	$inout5,$rndkey0
11855cdd308eSdjm	movups	$rndkey1,0x40($out)
11865cdd308eSdjm	movups	$rndkey0,0x50($out)
11875cdd308eSdjm	lea	0x60($out),$out
11885cdd308eSdjm	mov	$rnds_,$rounds
11895cdd308eSdjm	sub	\$6,$len
11905cdd308eSdjm	jnc	.Lctr32_loop6
11915cdd308eSdjm
11925cdd308eSdjm	add	\$6,$len
11935cdd308eSdjm	jz	.Lctr32_done
11945cdd308eSdjm	mov	$key_,$key			# restore $key
11955cdd308eSdjm	lea	1($rounds,$rounds),$rounds	# restore original value
11965cdd308eSdjm
11975cdd308eSdjm.Lctr32_tail:
11985cdd308eSdjm	por	$ivec,$inout0
11995cdd308eSdjm	movups	($inp),$in0
12005cdd308eSdjm	cmp	\$2,$len
12015cdd308eSdjm	jb	.Lctr32_one
12025cdd308eSdjm
12035cdd308eSdjm	por	$ivec,$inout1
12045cdd308eSdjm	movups	0x10($inp),$in1
12055cdd308eSdjm	je	.Lctr32_two
12065cdd308eSdjm
12075cdd308eSdjm	pshufd	\$`3<<6`,$iv1,$inout3
12085cdd308eSdjm	por	$ivec,$inout2
12095cdd308eSdjm	movups	0x20($inp),$in2
12105cdd308eSdjm	cmp	\$4,$len
12115cdd308eSdjm	jb	.Lctr32_three
12125cdd308eSdjm
12135cdd308eSdjm	pshufd	\$`2<<6`,$iv1,$inout4
12145cdd308eSdjm	por	$ivec,$inout3
12155cdd308eSdjm	movups	0x30($inp),$in3
12165cdd308eSdjm	je	.Lctr32_four
12175cdd308eSdjm
12185cdd308eSdjm	por	$ivec,$inout4
12195cdd308eSdjm	xorps	$inout5,$inout5
12205cdd308eSdjm
12215cdd308eSdjm	call	_aesni_encrypt6
12225cdd308eSdjm
12235cdd308eSdjm	movups	0x40($inp),$rndkey1
12245cdd308eSdjm	xorps	$inout0,$in0
12255cdd308eSdjm	xorps	$inout1,$in1
12265cdd308eSdjm	movups	$in0,($out)
12275cdd308eSdjm	xorps	$inout2,$in2
12285cdd308eSdjm	movups	$in1,0x10($out)
12295cdd308eSdjm	xorps	$inout3,$in3
12305cdd308eSdjm	movups	$in2,0x20($out)
12315cdd308eSdjm	xorps	$inout4,$rndkey1
12325cdd308eSdjm	movups	$in3,0x30($out)
12335cdd308eSdjm	movups	$rndkey1,0x40($out)
12345cdd308eSdjm	jmp	.Lctr32_done
12355cdd308eSdjm
12365cdd308eSdjm.align	16
12375cdd308eSdjm.Lctr32_one_shortcut:
12385cdd308eSdjm	movups	($ivp),$inout0
12395cdd308eSdjm	movups	($inp),$in0
12405cdd308eSdjm	mov	240($key),$rounds		# key->rounds
12415cdd308eSdjm.Lctr32_one:
12425cdd308eSdjm___
# One-block path shared by .Lctr32_one_shortcut and .Lctr32_one: a
# single-block encryption generated by aesni_generate1 (defined elsewhere
# in this file), whose result in $inout0 is XORed into $in0 below.
12435cdd308eSdjm	&aesni_generate1("enc",$key,$rounds);
# Remaining tails.  Each XORs the encrypted counter blocks into the input
# blocks already loaded and stores the result; the two-block path zeroes
# the third lane before calling _aesni_encrypt3.  All paths converge on
# .Lctr32_done.
12445cdd308eSdjm$code.=<<___;
12455cdd308eSdjm	xorps	$inout0,$in0
12465cdd308eSdjm	movups	$in0,($out)
12475cdd308eSdjm	jmp	.Lctr32_done
12485cdd308eSdjm
12495cdd308eSdjm.align	16
12505cdd308eSdjm.Lctr32_two:
12515cdd308eSdjm	xorps	$inout2,$inout2
12525cdd308eSdjm	call	_aesni_encrypt3
12535cdd308eSdjm	xorps	$inout0,$in0
12545cdd308eSdjm	xorps	$inout1,$in1
12555cdd308eSdjm	movups	$in0,($out)
12565cdd308eSdjm	movups	$in1,0x10($out)
12575cdd308eSdjm	jmp	.Lctr32_done
12585cdd308eSdjm
12595cdd308eSdjm.align	16
12605cdd308eSdjm.Lctr32_three:
12615cdd308eSdjm	call	_aesni_encrypt3
12625cdd308eSdjm	xorps	$inout0,$in0
12635cdd308eSdjm	xorps	$inout1,$in1
12645cdd308eSdjm	movups	$in0,($out)
12655cdd308eSdjm	xorps	$inout2,$in2
12665cdd308eSdjm	movups	$in1,0x10($out)
12675cdd308eSdjm	movups	$in2,0x20($out)
12685cdd308eSdjm	jmp	.Lctr32_done
12695cdd308eSdjm
12705cdd308eSdjm.align	16
12715cdd308eSdjm.Lctr32_four:
12725cdd308eSdjm	call	_aesni_encrypt4
12735cdd308eSdjm	xorps	$inout0,$in0
12745cdd308eSdjm	xorps	$inout1,$in1
12755cdd308eSdjm	movups	$in0,($out)
12765cdd308eSdjm	xorps	$inout2,$in2
12775cdd308eSdjm	movups	$in1,0x10($out)
12785cdd308eSdjm	xorps	$inout3,$in3
12795cdd308eSdjm	movups	$in2,0x20($out)
12805cdd308eSdjm	movups	$in3,0x30($out)
12815cdd308eSdjm
12825cdd308eSdjm.Lctr32_done:
12835cdd308eSdjm___
# Win64 only: reload %xmm6-%xmm15 from their save slots.
12845cdd308eSdjm$code.=<<___ if ($win64);
12855cdd308eSdjm	movaps	0x20(%rsp),%xmm6
12865cdd308eSdjm	movaps	0x30(%rsp),%xmm7
12875cdd308eSdjm	movaps	0x40(%rsp),%xmm8
12885cdd308eSdjm	movaps	0x50(%rsp),%xmm9
12895cdd308eSdjm	movaps	0x60(%rsp),%xmm10
12905cdd308eSdjm	movaps	0x70(%rsp),%xmm11
12915cdd308eSdjm	movaps	0x80(%rsp),%xmm12
12925cdd308eSdjm	movaps	0x90(%rsp),%xmm13
12935cdd308eSdjm	movaps	0xa0(%rsp),%xmm14
12945cdd308eSdjm	movaps	0xb0(%rsp),%xmm15
12955cdd308eSdjm___
# Epilogue: drop the frame by resetting %rsp from %rbp (which points at
# the saved-%rbp slot), restore %rbp and return.
12965cdd308eSdjm$code.=<<___;
12975c104365Sjsing	lea	(%rbp),%rsp
12985c104365Sjsing	pop	%rbp
12995c104365Sjsing.Lctr32_ret:
13005cdd308eSdjm	ret
13015cdd308eSdjm.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
13025cdd308eSdjm___
13036249468aSthib}
13046249468aSthib
13055cdd308eSdjm######################################################################
13065cdd308eSdjm# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
13075cdd308eSdjm#	const AES_KEY *key1, const AES_KEY *key2,
13085cdd308eSdjm#	const unsigned char iv[16]);
13095cdd308eSdjm#
13105cdd308eSdjm{
13115cdd308eSdjmmy @tweak=map("%xmm$_",(10..15));
13125cdd308eSdjmmy ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
13135cdd308eSdjmmy ($key2,$ivp,$len_)=("%r8","%r9","%r9");
13145c104365Sjsingmy $frame_size = 0x60 + ($win64?160:0);
13155cdd308eSdjm
13165cdd308eSdjm$code.=<<___;
13175cdd308eSdjm.globl	aesni_xts_encrypt
13185cdd308eSdjm.type	aesni_xts_encrypt,\@function,6
13195cdd308eSdjm.align	16
13205cdd308eSdjmaesni_xts_encrypt:
1321*5caf18b2Stb	_CET_ENDBR
13225c104365Sjsing	lea	(%rsp),%rax
13235c104365Sjsing	push	%rbp
13245c104365Sjsing	sub	\$$frame_size,%rsp
13255cdd308eSdjm___
13265cdd308eSdjm$code.=<<___ if ($win64);
13275cdd308eSdjm	movaps	%xmm6,0x60(%rsp)
13285cdd308eSdjm	movaps	%xmm7,0x70(%rsp)
13295cdd308eSdjm	movaps	%xmm8,0x80(%rsp)
13305cdd308eSdjm	movaps	%xmm9,0x90(%rsp)
13315cdd308eSdjm	movaps	%xmm10,0xa0(%rsp)
13325cdd308eSdjm	movaps	%xmm11,0xb0(%rsp)
13335cdd308eSdjm	movaps	%xmm12,0xc0(%rsp)
13345cdd308eSdjm	movaps	%xmm13,0xd0(%rsp)
13355cdd308eSdjm	movaps	%xmm14,0xe0(%rsp)
13365cdd308eSdjm	movaps	%xmm15,0xf0(%rsp)
13375cdd308eSdjm.Lxts_enc_body:
13385cdd308eSdjm___
13395cdd308eSdjm$code.=<<___;
13405c104365Sjsing	lea	-8(%rax),%rbp
13415cdd308eSdjm	movups	($ivp),@tweak[5]		# load clear-text tweak
13425cdd308eSdjm	mov	240(%r8),$rounds		# key2->rounds
13435cdd308eSdjm	mov	240($key),$rnds_		# key1->rounds
13445cdd308eSdjm___
13455cdd308eSdjm	# generate the tweak
13465cdd308eSdjm	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
13475cdd308eSdjm$code.=<<___;
13485cdd308eSdjm	mov	$key,$key_			# backup $key
13495cdd308eSdjm	mov	$rnds_,$rounds			# backup $rounds
13505cdd308eSdjm	mov	$len,$len_			# backup $len
13515cdd308eSdjm	and	\$-16,$len
13525cdd308eSdjm
13535cdd308eSdjm	movdqa	.Lxts_magic(%rip),$twmask
13545cdd308eSdjm	pxor	$twtmp,$twtmp
13555cdd308eSdjm	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
13565cdd308eSdjm___
13575cdd308eSdjm    for ($i=0;$i<4;$i++) {
13585cdd308eSdjm    $code.=<<___;
13595cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
13605cdd308eSdjm	pxor	$twtmp,$twtmp
13615cdd308eSdjm	movdqa	@tweak[5],@tweak[$i]
13625cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
13635cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
136471743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
13655cdd308eSdjm	pxor	$twres,@tweak[5]
13665cdd308eSdjm___
13675cdd308eSdjm    }
13685cdd308eSdjm$code.=<<___;
13695cdd308eSdjm	sub	\$16*6,$len
13705cdd308eSdjm	jc	.Lxts_enc_short
13715cdd308eSdjm
13725cdd308eSdjm	shr	\$1,$rounds
13735cdd308eSdjm	sub	\$1,$rounds
13745cdd308eSdjm	mov	$rounds,$rnds_
13755cdd308eSdjm	jmp	.Lxts_enc_grandloop
13765cdd308eSdjm
13775cdd308eSdjm.align	16
13785cdd308eSdjm.Lxts_enc_grandloop:
13795cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
13805cdd308eSdjm	movdqa	@tweak[5],@tweak[4]
13815cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
13825cdd308eSdjm	movdqu	`16*0`($inp),$inout0		# load input
13835cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
13845cdd308eSdjm	movdqu	`16*1`($inp),$inout1
13855cdd308eSdjm	pxor	$twres,@tweak[5]
13865cdd308eSdjm
13875cdd308eSdjm	movdqu	`16*2`($inp),$inout2
13885cdd308eSdjm	pxor	@tweak[0],$inout0		# input^=tweak
13895cdd308eSdjm	movdqu	`16*3`($inp),$inout3
13905cdd308eSdjm	pxor	@tweak[1],$inout1
13915cdd308eSdjm	movdqu	`16*4`($inp),$inout4
13925cdd308eSdjm	pxor	@tweak[2],$inout2
13935cdd308eSdjm	movdqu	`16*5`($inp),$inout5
13945cdd308eSdjm	lea	`16*6`($inp),$inp
13955cdd308eSdjm	pxor	@tweak[3],$inout3
13965cdd308eSdjm	$movkey		($key_),$rndkey0
13975cdd308eSdjm	pxor	@tweak[4],$inout4
13985cdd308eSdjm	pxor	@tweak[5],$inout5
13995cdd308eSdjm
14005cdd308eSdjm	# inline _aesni_encrypt6 and interleave first and last rounds
14015cdd308eSdjm	# with own code...
14025cdd308eSdjm	$movkey		16($key_),$rndkey1
14035cdd308eSdjm	pxor		$rndkey0,$inout0
14045cdd308eSdjm	pxor		$rndkey0,$inout1
14055cdd308eSdjm	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
14065cdd308eSdjm	aesenc		$rndkey1,$inout0
14075cdd308eSdjm	lea		32($key_),$key
14085cdd308eSdjm	pxor		$rndkey0,$inout2
14095cdd308eSdjm	 movdqa	@tweak[1],`16*1`(%rsp)
14105cdd308eSdjm	aesenc		$rndkey1,$inout1
14115cdd308eSdjm	pxor		$rndkey0,$inout3
14125cdd308eSdjm	 movdqa	@tweak[2],`16*2`(%rsp)
14135cdd308eSdjm	aesenc		$rndkey1,$inout2
14145cdd308eSdjm	pxor		$rndkey0,$inout4
14155cdd308eSdjm	 movdqa	@tweak[3],`16*3`(%rsp)
14165cdd308eSdjm	aesenc		$rndkey1,$inout3
14175cdd308eSdjm	pxor		$rndkey0,$inout5
14185cdd308eSdjm	$movkey		($key),$rndkey0
14195cdd308eSdjm	dec		$rounds
14205cdd308eSdjm	 movdqa	@tweak[4],`16*4`(%rsp)
14215cdd308eSdjm	aesenc		$rndkey1,$inout4
14225cdd308eSdjm	 movdqa	@tweak[5],`16*5`(%rsp)
14235cdd308eSdjm	aesenc		$rndkey1,$inout5
14245cdd308eSdjm	pxor	$twtmp,$twtmp
14255cdd308eSdjm	pcmpgtd	@tweak[5],$twtmp
14265cdd308eSdjm	jmp		.Lxts_enc_loop6_enter
14275cdd308eSdjm
14285cdd308eSdjm.align	16
14295cdd308eSdjm.Lxts_enc_loop6:
14305cdd308eSdjm	aesenc		$rndkey1,$inout0
14315cdd308eSdjm	aesenc		$rndkey1,$inout1
14325cdd308eSdjm	dec		$rounds
14335cdd308eSdjm	aesenc		$rndkey1,$inout2
14345cdd308eSdjm	aesenc		$rndkey1,$inout3
14355cdd308eSdjm	aesenc		$rndkey1,$inout4
14365cdd308eSdjm	aesenc		$rndkey1,$inout5
14375cdd308eSdjm.Lxts_enc_loop6_enter:
14385cdd308eSdjm	$movkey		16($key),$rndkey1
14395cdd308eSdjm	aesenc		$rndkey0,$inout0
14405cdd308eSdjm	aesenc		$rndkey0,$inout1
14415cdd308eSdjm	lea		32($key),$key
14425cdd308eSdjm	aesenc		$rndkey0,$inout2
14435cdd308eSdjm	aesenc		$rndkey0,$inout3
14445cdd308eSdjm	aesenc		$rndkey0,$inout4
14455cdd308eSdjm	aesenc		$rndkey0,$inout5
14465cdd308eSdjm	$movkey		($key),$rndkey0
14475cdd308eSdjm	jnz		.Lxts_enc_loop6
14485cdd308eSdjm
14495cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
14505cdd308eSdjm	pxor	$twtmp,$twtmp
14515cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
14525cdd308eSdjm	 aesenc		$rndkey1,$inout0
14535cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
14545cdd308eSdjm	 aesenc		$rndkey1,$inout1
14555cdd308eSdjm	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
14565cdd308eSdjm	 aesenc		$rndkey1,$inout2
14575cdd308eSdjm	pxor	$twres,@tweak[5]
14585cdd308eSdjm	 aesenc		$rndkey1,$inout3
14595cdd308eSdjm	 aesenc		$rndkey1,$inout4
14605cdd308eSdjm	 aesenc		$rndkey1,$inout5
14615cdd308eSdjm	 $movkey	16($key),$rndkey1
14625cdd308eSdjm
14635cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
14645cdd308eSdjm	pxor	$twtmp,$twtmp
14655cdd308eSdjm	movdqa	@tweak[5],@tweak[0]
14665cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
14675cdd308eSdjm	 aesenc		$rndkey0,$inout0
14685cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
14695cdd308eSdjm	 aesenc		$rndkey0,$inout1
147071743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
14715cdd308eSdjm	 aesenc		$rndkey0,$inout2
14725cdd308eSdjm	pxor	$twres,@tweak[5]
14735cdd308eSdjm	 aesenc		$rndkey0,$inout3
14745cdd308eSdjm	 aesenc		$rndkey0,$inout4
14755cdd308eSdjm	 aesenc		$rndkey0,$inout5
14765cdd308eSdjm	 $movkey	32($key),$rndkey0
14775cdd308eSdjm
14785cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
14795cdd308eSdjm	pxor	$twtmp,$twtmp
14805cdd308eSdjm	movdqa	@tweak[5],@tweak[1]
14815cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
14825cdd308eSdjm	 aesenc		$rndkey1,$inout0
14835cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
14845cdd308eSdjm	 aesenc		$rndkey1,$inout1
148571743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
14865cdd308eSdjm	 aesenc		$rndkey1,$inout2
14875cdd308eSdjm	pxor	$twres,@tweak[5]
14885cdd308eSdjm	 aesenc		$rndkey1,$inout3
14895cdd308eSdjm	 aesenc		$rndkey1,$inout4
14905cdd308eSdjm	 aesenc		$rndkey1,$inout5
14915cdd308eSdjm
14925cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
14935cdd308eSdjm	pxor	$twtmp,$twtmp
14945cdd308eSdjm	movdqa	@tweak[5],@tweak[2]
14955cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
14965cdd308eSdjm	 aesenclast	$rndkey0,$inout0
14975cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
14985cdd308eSdjm	 aesenclast	$rndkey0,$inout1
149971743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
15005cdd308eSdjm	 aesenclast	$rndkey0,$inout2
15015cdd308eSdjm	pxor	$twres,@tweak[5]
15025cdd308eSdjm	 aesenclast	$rndkey0,$inout3
15035cdd308eSdjm	 aesenclast	$rndkey0,$inout4
15045cdd308eSdjm	 aesenclast	$rndkey0,$inout5
15055cdd308eSdjm
15065cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
15075cdd308eSdjm	pxor	$twtmp,$twtmp
15085cdd308eSdjm	movdqa	@tweak[5],@tweak[3]
15095cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
15105cdd308eSdjm	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
15115cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
15125cdd308eSdjm	 xorps	`16*1`(%rsp),$inout1
151371743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
15145cdd308eSdjm	pxor	$twres,@tweak[5]
15155cdd308eSdjm
15165cdd308eSdjm	xorps	`16*2`(%rsp),$inout2
15175cdd308eSdjm	movups	$inout0,`16*0`($out)		# write output
15185cdd308eSdjm	xorps	`16*3`(%rsp),$inout3
15195cdd308eSdjm	movups	$inout1,`16*1`($out)
15205cdd308eSdjm	xorps	`16*4`(%rsp),$inout4
15215cdd308eSdjm	movups	$inout2,`16*2`($out)
15225cdd308eSdjm	xorps	`16*5`(%rsp),$inout5
15235cdd308eSdjm	movups	$inout3,`16*3`($out)
15245cdd308eSdjm	mov	$rnds_,$rounds			# restore $rounds
15255cdd308eSdjm	movups	$inout4,`16*4`($out)
15265cdd308eSdjm	movups	$inout5,`16*5`($out)
15275cdd308eSdjm	lea	`16*6`($out),$out
15285cdd308eSdjm	sub	\$16*6,$len
15295cdd308eSdjm	jnc	.Lxts_enc_grandloop
15305cdd308eSdjm
15315cdd308eSdjm	lea	3($rounds,$rounds),$rounds	# restore original value
15325cdd308eSdjm	mov	$key_,$key			# restore $key
15335cdd308eSdjm	mov	$rounds,$rnds_			# backup $rounds
15345cdd308eSdjm
15355cdd308eSdjm.Lxts_enc_short:
15365cdd308eSdjm	add	\$16*6,$len
15375cdd308eSdjm	jz	.Lxts_enc_done
15385cdd308eSdjm
15395cdd308eSdjm	cmp	\$0x20,$len
15405cdd308eSdjm	jb	.Lxts_enc_one
15415cdd308eSdjm	je	.Lxts_enc_two
15425cdd308eSdjm
15435cdd308eSdjm	cmp	\$0x40,$len
15445cdd308eSdjm	jb	.Lxts_enc_three
15455cdd308eSdjm	je	.Lxts_enc_four
15465cdd308eSdjm
15475cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
15485cdd308eSdjm	movdqa	@tweak[5],@tweak[4]
15495cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
15505cdd308eSdjm	 movdqu	($inp),$inout0
15515cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
15525cdd308eSdjm	 movdqu	16*1($inp),$inout1
15535cdd308eSdjm	pxor	$twres,@tweak[5]
15545cdd308eSdjm
15555cdd308eSdjm	movdqu	16*2($inp),$inout2
15565cdd308eSdjm	pxor	@tweak[0],$inout0
15575cdd308eSdjm	movdqu	16*3($inp),$inout3
15585cdd308eSdjm	pxor	@tweak[1],$inout1
15595cdd308eSdjm	movdqu	16*4($inp),$inout4
15605cdd308eSdjm	lea	16*5($inp),$inp
15615cdd308eSdjm	pxor	@tweak[2],$inout2
15625cdd308eSdjm	pxor	@tweak[3],$inout3
15635cdd308eSdjm	pxor	@tweak[4],$inout4
15645cdd308eSdjm
15655cdd308eSdjm	call	_aesni_encrypt6
15665cdd308eSdjm
15675cdd308eSdjm	xorps	@tweak[0],$inout0
15685cdd308eSdjm	movdqa	@tweak[5],@tweak[0]
15695cdd308eSdjm	xorps	@tweak[1],$inout1
15705cdd308eSdjm	xorps	@tweak[2],$inout2
15715cdd308eSdjm	movdqu	$inout0,($out)
15725cdd308eSdjm	xorps	@tweak[3],$inout3
15735cdd308eSdjm	movdqu	$inout1,16*1($out)
15745cdd308eSdjm	xorps	@tweak[4],$inout4
15755cdd308eSdjm	movdqu	$inout2,16*2($out)
15765cdd308eSdjm	movdqu	$inout3,16*3($out)
15775cdd308eSdjm	movdqu	$inout4,16*4($out)
15785cdd308eSdjm	lea	16*5($out),$out
15795cdd308eSdjm	jmp	.Lxts_enc_done
15805cdd308eSdjm
15815cdd308eSdjm.align	16
15825cdd308eSdjm.Lxts_enc_one:
15835cdd308eSdjm	movups	($inp),$inout0
15845cdd308eSdjm	lea	16*1($inp),$inp
15855cdd308eSdjm	xorps	@tweak[0],$inout0
15865cdd308eSdjm___
15875cdd308eSdjm	&aesni_generate1("enc",$key,$rounds);
15885cdd308eSdjm$code.=<<___;
15895cdd308eSdjm	xorps	@tweak[0],$inout0
15905cdd308eSdjm	movdqa	@tweak[1],@tweak[0]
15915cdd308eSdjm	movups	$inout0,($out)
15925cdd308eSdjm	lea	16*1($out),$out
15935cdd308eSdjm	jmp	.Lxts_enc_done
15945cdd308eSdjm
15955cdd308eSdjm.align	16
15965cdd308eSdjm.Lxts_enc_two:
15975cdd308eSdjm	movups	($inp),$inout0
15985cdd308eSdjm	movups	16($inp),$inout1
15995cdd308eSdjm	lea	32($inp),$inp
16005cdd308eSdjm	xorps	@tweak[0],$inout0
16015cdd308eSdjm	xorps	@tweak[1],$inout1
16025cdd308eSdjm
16035cdd308eSdjm	call	_aesni_encrypt3
16045cdd308eSdjm
16055cdd308eSdjm	xorps	@tweak[0],$inout0
16065cdd308eSdjm	movdqa	@tweak[2],@tweak[0]
16075cdd308eSdjm	xorps	@tweak[1],$inout1
16085cdd308eSdjm	movups	$inout0,($out)
16095cdd308eSdjm	movups	$inout1,16*1($out)
16105cdd308eSdjm	lea	16*2($out),$out
16115cdd308eSdjm	jmp	.Lxts_enc_done
16125cdd308eSdjm
16135cdd308eSdjm.align	16
16145cdd308eSdjm.Lxts_enc_three:
16155cdd308eSdjm	movups	($inp),$inout0
16165cdd308eSdjm	movups	16*1($inp),$inout1
16175cdd308eSdjm	movups	16*2($inp),$inout2
16185cdd308eSdjm	lea	16*3($inp),$inp
16195cdd308eSdjm	xorps	@tweak[0],$inout0
16205cdd308eSdjm	xorps	@tweak[1],$inout1
16215cdd308eSdjm	xorps	@tweak[2],$inout2
16225cdd308eSdjm
16235cdd308eSdjm	call	_aesni_encrypt3
16245cdd308eSdjm
16255cdd308eSdjm	xorps	@tweak[0],$inout0
16265cdd308eSdjm	movdqa	@tweak[3],@tweak[0]
16275cdd308eSdjm	xorps	@tweak[1],$inout1
16285cdd308eSdjm	xorps	@tweak[2],$inout2
16295cdd308eSdjm	movups	$inout0,($out)
16305cdd308eSdjm	movups	$inout1,16*1($out)
16315cdd308eSdjm	movups	$inout2,16*2($out)
16325cdd308eSdjm	lea	16*3($out),$out
16335cdd308eSdjm	jmp	.Lxts_enc_done
16345cdd308eSdjm
16355cdd308eSdjm.align	16
16365cdd308eSdjm.Lxts_enc_four:
16375cdd308eSdjm	movups	($inp),$inout0
16385cdd308eSdjm	movups	16*1($inp),$inout1
16395cdd308eSdjm	movups	16*2($inp),$inout2
16405cdd308eSdjm	xorps	@tweak[0],$inout0
16415cdd308eSdjm	movups	16*3($inp),$inout3
16425cdd308eSdjm	lea	16*4($inp),$inp
16435cdd308eSdjm	xorps	@tweak[1],$inout1
16445cdd308eSdjm	xorps	@tweak[2],$inout2
16455cdd308eSdjm	xorps	@tweak[3],$inout3
16465cdd308eSdjm
16475cdd308eSdjm	call	_aesni_encrypt4
16485cdd308eSdjm
16495cdd308eSdjm	xorps	@tweak[0],$inout0
16505cdd308eSdjm	movdqa	@tweak[5],@tweak[0]
16515cdd308eSdjm	xorps	@tweak[1],$inout1
16525cdd308eSdjm	xorps	@tweak[2],$inout2
16535cdd308eSdjm	movups	$inout0,($out)
16545cdd308eSdjm	xorps	@tweak[3],$inout3
16555cdd308eSdjm	movups	$inout1,16*1($out)
16565cdd308eSdjm	movups	$inout2,16*2($out)
16575cdd308eSdjm	movups	$inout3,16*3($out)
16585cdd308eSdjm	lea	16*4($out),$out
16595cdd308eSdjm	jmp	.Lxts_enc_done
16605cdd308eSdjm
16615cdd308eSdjm.align	16
16625cdd308eSdjm.Lxts_enc_done:
16635cdd308eSdjm	and	\$15,$len_
16645cdd308eSdjm	jz	.Lxts_enc_ret
16655cdd308eSdjm	mov	$len_,$len
16665cdd308eSdjm
16675cdd308eSdjm.Lxts_enc_steal:
16685cdd308eSdjm	movzb	($inp),%eax			# borrow $rounds ...
16695cdd308eSdjm	movzb	-16($out),%ecx			# ... and $key
16705cdd308eSdjm	lea	1($inp),$inp
16715cdd308eSdjm	mov	%al,-16($out)
16725cdd308eSdjm	mov	%cl,0($out)
16735cdd308eSdjm	lea	1($out),$out
16745cdd308eSdjm	sub	\$1,$len
16755cdd308eSdjm	jnz	.Lxts_enc_steal
16765cdd308eSdjm
16775cdd308eSdjm	sub	$len_,$out			# rewind $out
16785cdd308eSdjm	mov	$key_,$key			# restore $key
16795cdd308eSdjm	mov	$rnds_,$rounds			# restore $rounds
16805cdd308eSdjm
16815cdd308eSdjm	movups	-16($out),$inout0
16825cdd308eSdjm	xorps	@tweak[0],$inout0
16835cdd308eSdjm___
16845cdd308eSdjm	&aesni_generate1("enc",$key,$rounds);
16855cdd308eSdjm$code.=<<___;
16865cdd308eSdjm	xorps	@tweak[0],$inout0
16875cdd308eSdjm	movups	$inout0,-16($out)
16885cdd308eSdjm
16895cdd308eSdjm.Lxts_enc_ret:
16905cdd308eSdjm___
16915cdd308eSdjm$code.=<<___ if ($win64);
16925cdd308eSdjm	movaps	0x60(%rsp),%xmm6
16935cdd308eSdjm	movaps	0x70(%rsp),%xmm7
16945cdd308eSdjm	movaps	0x80(%rsp),%xmm8
16955cdd308eSdjm	movaps	0x90(%rsp),%xmm9
16965cdd308eSdjm	movaps	0xa0(%rsp),%xmm10
16975cdd308eSdjm	movaps	0xb0(%rsp),%xmm11
16985cdd308eSdjm	movaps	0xc0(%rsp),%xmm12
16995cdd308eSdjm	movaps	0xd0(%rsp),%xmm13
17005cdd308eSdjm	movaps	0xe0(%rsp),%xmm14
17015cdd308eSdjm	movaps	0xf0(%rsp),%xmm15
17025cdd308eSdjm___
17035cdd308eSdjm$code.=<<___;
17045c104365Sjsing	lea	(%rbp),%rsp
17055c104365Sjsing	pop	%rbp
17065cdd308eSdjm.Lxts_enc_epilogue:
17075cdd308eSdjm	ret
17085cdd308eSdjm.size	aesni_xts_encrypt,.-aesni_xts_encrypt
17095cdd308eSdjm___
17105cdd308eSdjm
# aesni_xts_decrypt: emit the XTS-mode decryption routine (declared below
# as a six-argument function through the ".type" directive).
#
# Outline of the generated code, as far as it can be read from this block
# ($inp, $out, $len, $key, $key2, $ivp, @tweak, $twmask, $twtmp, $twres and
# $frame_size are all defined outside it):
#   1. prologue: remember %rsp in %rax, push %rbp, reserve $frame_size bytes;
#   2. load the tweak from ($ivp) and pass it through aesni_generate1("enc")
#      together with $key2;
#   3. if $len is not a multiple of 16, hold back one full block for the
#      stealing tail, then pre-compute @tweak[0..3];
#   4. decrypt six blocks per iteration in .Lxts_dec_grandloop, then the
#      one to five remaining blocks in the .Lxts_dec_short paths;
#   5. handle a trailing partial block (.Lxts_dec_done2 / .Lxts_dec_steal);
#   6. epilogue: restore %rsp from %rbp, pop %rbp, ret.
17115cdd308eSdjm$code.=<<___;
17125cdd308eSdjm.globl	aesni_xts_decrypt
17135cdd308eSdjm.type	aesni_xts_decrypt,\@function,6
17145cdd308eSdjm.align	16
17155cdd308eSdjmaesni_xts_decrypt:
1716*5caf18b2Stb	_CET_ENDBR
17175c104365Sjsing	lea	(%rsp),%rax
17185c104365Sjsing	push	%rbp
17195c104365Sjsing	sub	\$$frame_size,%rsp
17205cdd308eSdjm___
# Win64 only: spill %xmm6-%xmm15 into the frame at 0x60..0xf0(%rsp); the
# matching reloads are emitted just before the epilogue of this routine.
17215cdd308eSdjm$code.=<<___ if ($win64);
17225cdd308eSdjm	movaps	%xmm6,0x60(%rsp)
17235cdd308eSdjm	movaps	%xmm7,0x70(%rsp)
17245cdd308eSdjm	movaps	%xmm8,0x80(%rsp)
17255cdd308eSdjm	movaps	%xmm9,0x90(%rsp)
17265cdd308eSdjm	movaps	%xmm10,0xa0(%rsp)
17275cdd308eSdjm	movaps	%xmm11,0xb0(%rsp)
17285cdd308eSdjm	movaps	%xmm12,0xc0(%rsp)
17295cdd308eSdjm	movaps	%xmm13,0xd0(%rsp)
17305cdd308eSdjm	movaps	%xmm14,0xe0(%rsp)
17315cdd308eSdjm	movaps	%xmm15,0xf0(%rsp)
17325cdd308eSdjm.Lxts_dec_body:
17335cdd308eSdjm___
# %rbp = %rax-8, i.e. the stack slot that the "push %rbp" above wrote; the
# epilogue unwinds the frame through it.  Then fetch the tweak block from
# ($ivp) and the two round counts stored at offset 240 of $key2 and $key.
17345cdd308eSdjm$code.=<<___;
17355c104365Sjsing	lea	-8(%rax),%rbp
17365cdd308eSdjm	movups	($ivp),@tweak[5]		# load clear-text tweak
17375cdd308eSdjm	mov	240($key2),$rounds		# key2->rounds
17385cdd308eSdjm	mov	240($key),$rnds_		# key1->rounds
17395cdd308eSdjm___
17405cdd308eSdjm	# generate the tweak
	# NOTE(review): aesni_generate1 is defined outside this chunk; the
	# fourth argument presumably names the register that is transformed
	# in place (here the tweak in @tweak[5]) -- confirm against its
	# definition.
17415cdd308eSdjm	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
# Length bookkeeping.  When $len has a partial trailing block, 16 is
# subtracted so that the last full block is left for the stealing tail.
# $key is copied to $key_, $rounds is loaded from $rnds_ (the key1 round
# count read above), and the adjusted length is kept in $len_ before $len
# is rounded down to a multiple of 16.  Finally $twmask is loaded from
# .Lxts_magic (defined elsewhere) and $twtmp is primed for the first tweak
# update.
17425cdd308eSdjm$code.=<<___;
17435cdd308eSdjm	xor	%eax,%eax			# if ($len%16) len-=16;
17445cdd308eSdjm	test	\$15,$len
17455cdd308eSdjm	setnz	%al
17465cdd308eSdjm	shl	\$4,%rax
17475cdd308eSdjm	sub	%rax,$len
17485cdd308eSdjm
17495cdd308eSdjm	mov	$key,$key_			# backup $key
17505cdd308eSdjm	mov	$rnds_,$rounds			# backup $rounds
17515cdd308eSdjm	mov	$len,$len_			# backup $len
17525cdd308eSdjm	and	\$-16,$len
17535cdd308eSdjm
17545cdd308eSdjm	movdqa	.Lxts_magic(%rip),$twmask
17555cdd308eSdjm	pxor	$twtmp,$twtmp
17565cdd308eSdjm	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
17575cdd308eSdjm___
# Pre-compute four consecutive tweaks.  Each pass copies the running tweak
# (@tweak[5]) into @tweak[$i] and then advances @tweak[5] by one step:
# paddq doubles both 64-bit halves, and the bits selected through $twtmp and
# $twmask are xor-ed back in (see the in-line asm comments).  Presumably
# this is the usual XTS tweak update; the mask constant itself is not
# visible here.
17585cdd308eSdjm    for ($i=0;$i<4;$i++) {
17595cdd308eSdjm    $code.=<<___;
17605cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
17615cdd308eSdjm	pxor	$twtmp,$twtmp
17625cdd308eSdjm	movdqa	@tweak[5],@tweak[$i]
17635cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
17645cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
176571743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
17665cdd308eSdjm	pxor	$twres,@tweak[5]
17675cdd308eSdjm___
17685cdd308eSdjm    }
# Main body.
#
# .Lxts_dec_grandloop handles six blocks per iteration: it derives
# @tweak[4] and the next @tweak[5], xors the six inputs with @tweak[0..5],
# parks those tweaks at 16*0..16*5(%rsp), runs the inlined six-block
# decrypt, and overlaps the final rounds with generation of the next
# @tweak[0..3].  Before the loop $rounds is adjusted (shr 1, sub 1) because
# .Lxts_dec_loop6 applies two round keys per pass; the adjustment is undone
# by the "lea 3(...)" after the loop.
#
# .Lxts_dec_short adds the 16*6 back and dispatches on the remaining
# whole-block byte count:
#   <  0x20 -> .Lxts_dec_one        == 0x20 -> .Lxts_dec_two
#   <  0x40 -> .Lxts_dec_three      == 0x40 -> .Lxts_dec_four
#   otherwise it falls through to the five-block code.
# The five-block code reuses $twtmp as left by the most recent tweak update,
# calls _aesni_decrypt6, and, when a partial block remains, leaves the next
# two tweaks in @tweak[0] and @tweak[1] before jumping to .Lxts_dec_done2.
17695cdd308eSdjm$code.=<<___;
17705cdd308eSdjm	sub	\$16*6,$len
17715cdd308eSdjm	jc	.Lxts_dec_short
17725cdd308eSdjm
17735cdd308eSdjm	shr	\$1,$rounds
17745cdd308eSdjm	sub	\$1,$rounds
17755cdd308eSdjm	mov	$rounds,$rnds_
17765cdd308eSdjm	jmp	.Lxts_dec_grandloop
17775cdd308eSdjm
17785cdd308eSdjm.align	16
17795cdd308eSdjm.Lxts_dec_grandloop:
17805cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
17815cdd308eSdjm	movdqa	@tweak[5],@tweak[4]
17825cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
17835cdd308eSdjm	movdqu	`16*0`($inp),$inout0		# load input
17845cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
17855cdd308eSdjm	movdqu	`16*1`($inp),$inout1
17865cdd308eSdjm	pxor	$twres,@tweak[5]
17875cdd308eSdjm
17885cdd308eSdjm	movdqu	`16*2`($inp),$inout2
17895cdd308eSdjm	pxor	@tweak[0],$inout0		# input^=tweak
17905cdd308eSdjm	movdqu	`16*3`($inp),$inout3
17915cdd308eSdjm	pxor	@tweak[1],$inout1
17925cdd308eSdjm	movdqu	`16*4`($inp),$inout4
17935cdd308eSdjm	pxor	@tweak[2],$inout2
17945cdd308eSdjm	movdqu	`16*5`($inp),$inout5
17955cdd308eSdjm	lea	`16*6`($inp),$inp
17965cdd308eSdjm	pxor	@tweak[3],$inout3
17975cdd308eSdjm	$movkey		($key_),$rndkey0
17985cdd308eSdjm	pxor	@tweak[4],$inout4
17995cdd308eSdjm	pxor	@tweak[5],$inout5
18005cdd308eSdjm
18015cdd308eSdjm	# inline _aesni_decrypt6 and interleave first and last rounds
18025cdd308eSdjm	# with own code...
18035cdd308eSdjm	$movkey		16($key_),$rndkey1
18045cdd308eSdjm	pxor		$rndkey0,$inout0
18055cdd308eSdjm	pxor		$rndkey0,$inout1
18065cdd308eSdjm	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
18075cdd308eSdjm	aesdec		$rndkey1,$inout0
18085cdd308eSdjm	lea		32($key_),$key
18095cdd308eSdjm	pxor		$rndkey0,$inout2
18105cdd308eSdjm	 movdqa	@tweak[1],`16*1`(%rsp)
18115cdd308eSdjm	aesdec		$rndkey1,$inout1
18125cdd308eSdjm	pxor		$rndkey0,$inout3
18135cdd308eSdjm	 movdqa	@tweak[2],`16*2`(%rsp)
18145cdd308eSdjm	aesdec		$rndkey1,$inout2
18155cdd308eSdjm	pxor		$rndkey0,$inout4
18165cdd308eSdjm	 movdqa	@tweak[3],`16*3`(%rsp)
18175cdd308eSdjm	aesdec		$rndkey1,$inout3
18185cdd308eSdjm	pxor		$rndkey0,$inout5
18195cdd308eSdjm	$movkey		($key),$rndkey0
18205cdd308eSdjm	dec		$rounds
18215cdd308eSdjm	 movdqa	@tweak[4],`16*4`(%rsp)
18225cdd308eSdjm	aesdec		$rndkey1,$inout4
18235cdd308eSdjm	 movdqa	@tweak[5],`16*5`(%rsp)
18245cdd308eSdjm	aesdec		$rndkey1,$inout5
18255cdd308eSdjm	pxor	$twtmp,$twtmp
18265cdd308eSdjm	pcmpgtd	@tweak[5],$twtmp
18275cdd308eSdjm	jmp		.Lxts_dec_loop6_enter
18285cdd308eSdjm
18295cdd308eSdjm.align	16
18305cdd308eSdjm.Lxts_dec_loop6:
18315cdd308eSdjm	aesdec		$rndkey1,$inout0
18325cdd308eSdjm	aesdec		$rndkey1,$inout1
18335cdd308eSdjm	dec		$rounds
18345cdd308eSdjm	aesdec		$rndkey1,$inout2
18355cdd308eSdjm	aesdec		$rndkey1,$inout3
18365cdd308eSdjm	aesdec		$rndkey1,$inout4
18375cdd308eSdjm	aesdec		$rndkey1,$inout5
18385cdd308eSdjm.Lxts_dec_loop6_enter:
18395cdd308eSdjm	$movkey		16($key),$rndkey1
18405cdd308eSdjm	aesdec		$rndkey0,$inout0
18415cdd308eSdjm	aesdec		$rndkey0,$inout1
18425cdd308eSdjm	lea		32($key),$key
18435cdd308eSdjm	aesdec		$rndkey0,$inout2
18445cdd308eSdjm	aesdec		$rndkey0,$inout3
18455cdd308eSdjm	aesdec		$rndkey0,$inout4
18465cdd308eSdjm	aesdec		$rndkey0,$inout5
18475cdd308eSdjm	$movkey		($key),$rndkey0
18485cdd308eSdjm	jnz		.Lxts_dec_loop6
18495cdd308eSdjm
18505cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
18515cdd308eSdjm	pxor	$twtmp,$twtmp
18525cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
18535cdd308eSdjm	 aesdec		$rndkey1,$inout0
18545cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
18555cdd308eSdjm	 aesdec		$rndkey1,$inout1
18565cdd308eSdjm	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
18575cdd308eSdjm	 aesdec		$rndkey1,$inout2
18585cdd308eSdjm	pxor	$twres,@tweak[5]
18595cdd308eSdjm	 aesdec		$rndkey1,$inout3
18605cdd308eSdjm	 aesdec		$rndkey1,$inout4
18615cdd308eSdjm	 aesdec		$rndkey1,$inout5
18625cdd308eSdjm	 $movkey	16($key),$rndkey1
18635cdd308eSdjm
18645cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
18655cdd308eSdjm	pxor	$twtmp,$twtmp
18665cdd308eSdjm	movdqa	@tweak[5],@tweak[0]
18675cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
18685cdd308eSdjm	 aesdec		$rndkey0,$inout0
18695cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
18705cdd308eSdjm	 aesdec		$rndkey0,$inout1
187171743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
18725cdd308eSdjm	 aesdec		$rndkey0,$inout2
18735cdd308eSdjm	pxor	$twres,@tweak[5]
18745cdd308eSdjm	 aesdec		$rndkey0,$inout3
18755cdd308eSdjm	 aesdec		$rndkey0,$inout4
18765cdd308eSdjm	 aesdec		$rndkey0,$inout5
18775cdd308eSdjm	 $movkey	32($key),$rndkey0
18785cdd308eSdjm
18795cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
18805cdd308eSdjm	pxor	$twtmp,$twtmp
18815cdd308eSdjm	movdqa	@tweak[5],@tweak[1]
18825cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
18835cdd308eSdjm	 aesdec		$rndkey1,$inout0
18845cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
18855cdd308eSdjm	 aesdec		$rndkey1,$inout1
188671743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
18875cdd308eSdjm	 aesdec		$rndkey1,$inout2
18885cdd308eSdjm	pxor	$twres,@tweak[5]
18895cdd308eSdjm	 aesdec		$rndkey1,$inout3
18905cdd308eSdjm	 aesdec		$rndkey1,$inout4
18915cdd308eSdjm	 aesdec		$rndkey1,$inout5
18925cdd308eSdjm
18935cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
18945cdd308eSdjm	pxor	$twtmp,$twtmp
18955cdd308eSdjm	movdqa	@tweak[5],@tweak[2]
18965cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
18975cdd308eSdjm	 aesdeclast	$rndkey0,$inout0
18985cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
18995cdd308eSdjm	 aesdeclast	$rndkey0,$inout1
190071743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
19015cdd308eSdjm	 aesdeclast	$rndkey0,$inout2
19025cdd308eSdjm	pxor	$twres,@tweak[5]
19035cdd308eSdjm	 aesdeclast	$rndkey0,$inout3
19045cdd308eSdjm	 aesdeclast	$rndkey0,$inout4
19055cdd308eSdjm	 aesdeclast	$rndkey0,$inout5
19065cdd308eSdjm
19075cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
19085cdd308eSdjm	pxor	$twtmp,$twtmp
19095cdd308eSdjm	movdqa	@tweak[5],@tweak[3]
19105cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
19115cdd308eSdjm	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
19125cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
19135cdd308eSdjm	 xorps	`16*1`(%rsp),$inout1
191471743258Sjmc	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
19155cdd308eSdjm	pxor	$twres,@tweak[5]
19165cdd308eSdjm
19175cdd308eSdjm	xorps	`16*2`(%rsp),$inout2
19185cdd308eSdjm	movups	$inout0,`16*0`($out)		# write output
19195cdd308eSdjm	xorps	`16*3`(%rsp),$inout3
19205cdd308eSdjm	movups	$inout1,`16*1`($out)
19215cdd308eSdjm	xorps	`16*4`(%rsp),$inout4
19225cdd308eSdjm	movups	$inout2,`16*2`($out)
19235cdd308eSdjm	xorps	`16*5`(%rsp),$inout5
19245cdd308eSdjm	movups	$inout3,`16*3`($out)
19255cdd308eSdjm	mov	$rnds_,$rounds			# restore $rounds
19265cdd308eSdjm	movups	$inout4,`16*4`($out)
19275cdd308eSdjm	movups	$inout5,`16*5`($out)
19285cdd308eSdjm	lea	`16*6`($out),$out
19295cdd308eSdjm	sub	\$16*6,$len
19305cdd308eSdjm	jnc	.Lxts_dec_grandloop
19315cdd308eSdjm
19325cdd308eSdjm	lea	3($rounds,$rounds),$rounds	# restore original value
19335cdd308eSdjm	mov	$key_,$key			# restore $key
19345cdd308eSdjm	mov	$rounds,$rnds_			# backup $rounds
19355cdd308eSdjm
19365cdd308eSdjm.Lxts_dec_short:
19375cdd308eSdjm	add	\$16*6,$len
19385cdd308eSdjm	jz	.Lxts_dec_done
19395cdd308eSdjm
19405cdd308eSdjm	cmp	\$0x20,$len
19415cdd308eSdjm	jb	.Lxts_dec_one
19425cdd308eSdjm	je	.Lxts_dec_two
19435cdd308eSdjm
19445cdd308eSdjm	cmp	\$0x40,$len
19455cdd308eSdjm	jb	.Lxts_dec_three
19465cdd308eSdjm	je	.Lxts_dec_four
19475cdd308eSdjm
19485cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
19495cdd308eSdjm	movdqa	@tweak[5],@tweak[4]
19505cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
19515cdd308eSdjm	 movdqu	($inp),$inout0
19525cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
19535cdd308eSdjm	 movdqu	16*1($inp),$inout1
19545cdd308eSdjm	pxor	$twres,@tweak[5]
19555cdd308eSdjm
19565cdd308eSdjm	movdqu	16*2($inp),$inout2
19575cdd308eSdjm	pxor	@tweak[0],$inout0
19585cdd308eSdjm	movdqu	16*3($inp),$inout3
19595cdd308eSdjm	pxor	@tweak[1],$inout1
19605cdd308eSdjm	movdqu	16*4($inp),$inout4
19615cdd308eSdjm	lea	16*5($inp),$inp
19625cdd308eSdjm	pxor	@tweak[2],$inout2
19635cdd308eSdjm	pxor	@tweak[3],$inout3
19645cdd308eSdjm	pxor	@tweak[4],$inout4
19655cdd308eSdjm
19665cdd308eSdjm	call	_aesni_decrypt6
19675cdd308eSdjm
19685cdd308eSdjm	xorps	@tweak[0],$inout0
19695cdd308eSdjm	xorps	@tweak[1],$inout1
19705cdd308eSdjm	xorps	@tweak[2],$inout2
19715cdd308eSdjm	movdqu	$inout0,($out)
19725cdd308eSdjm	xorps	@tweak[3],$inout3
19735cdd308eSdjm	movdqu	$inout1,16*1($out)
19745cdd308eSdjm	xorps	@tweak[4],$inout4
19755cdd308eSdjm	movdqu	$inout2,16*2($out)
19765cdd308eSdjm	 pxor		$twtmp,$twtmp
19775cdd308eSdjm	movdqu	$inout3,16*3($out)
19785cdd308eSdjm	 pcmpgtd	@tweak[5],$twtmp
19795cdd308eSdjm	movdqu	$inout4,16*4($out)
19805cdd308eSdjm	lea	16*5($out),$out
19815cdd308eSdjm	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
19825cdd308eSdjm	and	\$15,$len_
19835cdd308eSdjm	jz	.Lxts_dec_ret
19845cdd308eSdjm
19855cdd308eSdjm	movdqa	@tweak[5],@tweak[0]
19865cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
19875cdd308eSdjm	pand	$twmask,@tweak[1]		# isolate carry and residue
19885cdd308eSdjm	pxor	@tweak[5],@tweak[1]
19895cdd308eSdjm	jmp	.Lxts_dec_done2
19905cdd308eSdjm
19915cdd308eSdjm.align	16
19925cdd308eSdjm.Lxts_dec_one:
19935cdd308eSdjm	movups	($inp),$inout0
19945cdd308eSdjm	lea	16*1($inp),$inp
19955cdd308eSdjm	xorps	@tweak[0],$inout0
19965cdd308eSdjm___
# .Lxts_dec_one: $inout0 has already been xor-ed with @tweak[0]; the call
# below presumably emits the single-block decrypt of $inout0 (the helper is
# defined outside this chunk).
19975cdd308eSdjm	&aesni_generate1("dec",$key,$rounds);
# Tails for one to four blocks.  Each tail ends with the next two tweaks in
# sequence moved into @tweak[0] and @tweak[1], which is what the code at
# .Lxts_dec_done2 consumes.  .Lxts_dec_done returns early when $len_ & 15 is
# zero; otherwise $len becomes that residual byte count and the block at
# ($inp) is xor-ed with @tweak[1] -- the later of the two tweaks -- ahead of
# the decrypt emitted right after this heredoc.
19985cdd308eSdjm$code.=<<___;
19995cdd308eSdjm	xorps	@tweak[0],$inout0
20005cdd308eSdjm	movdqa	@tweak[1],@tweak[0]
20015cdd308eSdjm	movups	$inout0,($out)
20025cdd308eSdjm	movdqa	@tweak[2],@tweak[1]
20035cdd308eSdjm	lea	16*1($out),$out
20045cdd308eSdjm	jmp	.Lxts_dec_done
20055cdd308eSdjm
20065cdd308eSdjm.align	16
20075cdd308eSdjm.Lxts_dec_two:
20085cdd308eSdjm	movups	($inp),$inout0
20095cdd308eSdjm	movups	16($inp),$inout1
20105cdd308eSdjm	lea	32($inp),$inp
20115cdd308eSdjm	xorps	@tweak[0],$inout0
20125cdd308eSdjm	xorps	@tweak[1],$inout1
20135cdd308eSdjm
20145cdd308eSdjm	call	_aesni_decrypt3
20155cdd308eSdjm
20165cdd308eSdjm	xorps	@tweak[0],$inout0
20175cdd308eSdjm	movdqa	@tweak[2],@tweak[0]
20185cdd308eSdjm	xorps	@tweak[1],$inout1
20195cdd308eSdjm	movdqa	@tweak[3],@tweak[1]
20205cdd308eSdjm	movups	$inout0,($out)
20215cdd308eSdjm	movups	$inout1,16*1($out)
20225cdd308eSdjm	lea	16*2($out),$out
20235cdd308eSdjm	jmp	.Lxts_dec_done
20245cdd308eSdjm
20255cdd308eSdjm.align	16
20265cdd308eSdjm.Lxts_dec_three:
20275cdd308eSdjm	movups	($inp),$inout0
20285cdd308eSdjm	movups	16*1($inp),$inout1
20295cdd308eSdjm	movups	16*2($inp),$inout2
20305cdd308eSdjm	lea	16*3($inp),$inp
20315cdd308eSdjm	xorps	@tweak[0],$inout0
20325cdd308eSdjm	xorps	@tweak[1],$inout1
20335cdd308eSdjm	xorps	@tweak[2],$inout2
20345cdd308eSdjm
20355cdd308eSdjm	call	_aesni_decrypt3
20365cdd308eSdjm
20375cdd308eSdjm	xorps	@tweak[0],$inout0
20385cdd308eSdjm	movdqa	@tweak[3],@tweak[0]
20395cdd308eSdjm	xorps	@tweak[1],$inout1
20405cdd308eSdjm	movdqa	@tweak[5],@tweak[1]
20415cdd308eSdjm	xorps	@tweak[2],$inout2
20425cdd308eSdjm	movups	$inout0,($out)
20435cdd308eSdjm	movups	$inout1,16*1($out)
20445cdd308eSdjm	movups	$inout2,16*2($out)
20455cdd308eSdjm	lea	16*3($out),$out
20465cdd308eSdjm	jmp	.Lxts_dec_done
20475cdd308eSdjm
20485cdd308eSdjm.align	16
20495cdd308eSdjm.Lxts_dec_four:
20505cdd308eSdjm	pshufd	\$0x13,$twtmp,$twres
20515cdd308eSdjm	movdqa	@tweak[5],@tweak[4]
20525cdd308eSdjm	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
20535cdd308eSdjm	 movups	($inp),$inout0
20545cdd308eSdjm	pand	$twmask,$twres			# isolate carry and residue
20555cdd308eSdjm	 movups	16*1($inp),$inout1
20565cdd308eSdjm	pxor	$twres,@tweak[5]
20575cdd308eSdjm
20585cdd308eSdjm	movups	16*2($inp),$inout2
20595cdd308eSdjm	xorps	@tweak[0],$inout0
20605cdd308eSdjm	movups	16*3($inp),$inout3
20615cdd308eSdjm	lea	16*4($inp),$inp
20625cdd308eSdjm	xorps	@tweak[1],$inout1
20635cdd308eSdjm	xorps	@tweak[2],$inout2
20645cdd308eSdjm	xorps	@tweak[3],$inout3
20655cdd308eSdjm
20665cdd308eSdjm	call	_aesni_decrypt4
20675cdd308eSdjm
20685cdd308eSdjm	xorps	@tweak[0],$inout0
20695cdd308eSdjm	movdqa	@tweak[4],@tweak[0]
20705cdd308eSdjm	xorps	@tweak[1],$inout1
20715cdd308eSdjm	movdqa	@tweak[5],@tweak[1]
20725cdd308eSdjm	xorps	@tweak[2],$inout2
20735cdd308eSdjm	movups	$inout0,($out)
20745cdd308eSdjm	xorps	@tweak[3],$inout3
20755cdd308eSdjm	movups	$inout1,16*1($out)
20765cdd308eSdjm	movups	$inout2,16*2($out)
20775cdd308eSdjm	movups	$inout3,16*3($out)
20785cdd308eSdjm	lea	16*4($out),$out
20795cdd308eSdjm	jmp	.Lxts_dec_done
20805cdd308eSdjm
20815cdd308eSdjm.align	16
20825cdd308eSdjm.Lxts_dec_done:
20835cdd308eSdjm	and	\$15,$len_
20845cdd308eSdjm	jz	.Lxts_dec_ret
20855cdd308eSdjm.Lxts_dec_done2:
20865cdd308eSdjm	mov	$len_,$len
20875cdd308eSdjm	mov	$key_,$key			# restore $key
20885cdd308eSdjm	mov	$rnds_,$rounds			# restore $rounds
20895cdd308eSdjm
20905cdd308eSdjm	movups	($inp),$inout0
20915cdd308eSdjm	xorps	@tweak[1],$inout0
20925cdd308eSdjm___
# Single-block decrypt of the last full input block (tweaked by @tweak[1]).
20935cdd308eSdjm	&aesni_generate1("dec",$key,$rounds);
# Un-tweak with @tweak[1] and store the result at ($out), then run the steal
# loop: for each of the $len trailing bytes, the input byte at 16($inp)
# replaces the byte at ($out), and the byte it displaced is written to
# 16($out).  Afterwards $out is rewound by $len_, $key and $rounds are
# restored (the loop borrowed %eax and %ecx, per the in-line comments), and
# the patched block at ($out) is xor-ed with @tweak[0] for the final
# single-block decrypt.
20945cdd308eSdjm$code.=<<___;
20955cdd308eSdjm	xorps	@tweak[1],$inout0
20965cdd308eSdjm	movups	$inout0,($out)
20975cdd308eSdjm
20985cdd308eSdjm.Lxts_dec_steal:
20995cdd308eSdjm	movzb	16($inp),%eax			# borrow $rounds ...
21005cdd308eSdjm	movzb	($out),%ecx			# ... and $key
21015cdd308eSdjm	lea	1($inp),$inp
21025cdd308eSdjm	mov	%al,($out)
21035cdd308eSdjm	mov	%cl,16($out)
21045cdd308eSdjm	lea	1($out),$out
21055cdd308eSdjm	sub	\$1,$len
21065cdd308eSdjm	jnz	.Lxts_dec_steal
21075cdd308eSdjm
21085cdd308eSdjm	sub	$len_,$out			# rewind $out
21095cdd308eSdjm	mov	$key_,$key			# restore $key
21105cdd308eSdjm	mov	$rnds_,$rounds			# restore $rounds
21115cdd308eSdjm
21125cdd308eSdjm	movups	($out),$inout0
21135cdd308eSdjm	xorps	@tweak[0],$inout0
21145cdd308eSdjm___
# Single-block decrypt of the reassembled block (tweaked by @tweak[0]).
21155cdd308eSdjm	&aesni_generate1("dec",$key,$rounds);
# Un-tweak with @tweak[0] and write the block back in place at ($out).
21165cdd308eSdjm$code.=<<___;
21175cdd308eSdjm	xorps	@tweak[0],$inout0
21185cdd308eSdjm	movups	$inout0,($out)
21195cdd308eSdjm
21205cdd308eSdjm.Lxts_dec_ret:
21215cdd308eSdjm___
# Win64 only: reload %xmm6-%xmm15 from the slots filled in the prologue.
21225cdd308eSdjm$code.=<<___ if ($win64);
21235cdd308eSdjm	movaps	0x60(%rsp),%xmm6
21245cdd308eSdjm	movaps	0x70(%rsp),%xmm7
21255cdd308eSdjm	movaps	0x80(%rsp),%xmm8
21265cdd308eSdjm	movaps	0x90(%rsp),%xmm9
21275cdd308eSdjm	movaps	0xa0(%rsp),%xmm10
21285cdd308eSdjm	movaps	0xb0(%rsp),%xmm11
21295cdd308eSdjm	movaps	0xc0(%rsp),%xmm12
21305cdd308eSdjm	movaps	0xd0(%rsp),%xmm13
21315cdd308eSdjm	movaps	0xe0(%rsp),%xmm14
21325cdd308eSdjm	movaps	0xf0(%rsp),%xmm15
21335cdd308eSdjm___
# Epilogue: %rbp still points at the saved-%rbp slot set up in the prologue,
# so reloading %rsp from it and popping %rbp discards the whole frame.
21345cdd308eSdjm$code.=<<___;
21355c104365Sjsing	lea	(%rbp),%rsp
21365c104365Sjsing	pop	%rbp
21375cdd308eSdjm.Lxts_dec_epilogue:
21385cdd308eSdjm	ret
21395cdd308eSdjm.size	aesni_xts_decrypt,.-aesni_xts_decrypt
21405cdd308eSdjm___
21415cdd308eSdjm} }}
21425cdd308eSdjm
21435cdd308eSdjm########################################################################
21446249468aSthib# void $PREFIX_cbc_encrypt (const void *inp, void *out,
21456249468aSthib#			    size_t length, const AES_KEY *key,
21466249468aSthib#			    unsigned char *ivp,const int enc);
21475cdd308eSdjm{
21485c104365Sjsingmy $frame_size = 0x10 + ($win64?0x40:0);	# used in decrypt
21496249468aSthib$code.=<<___;
21506249468aSthib.globl	${PREFIX}_cbc_encrypt
21516249468aSthib.type	${PREFIX}_cbc_encrypt,\@function,6
21526249468aSthib.align	16
21536249468aSthib${PREFIX}_cbc_encrypt:
2154*5caf18b2Stb	_CET_ENDBR
21556249468aSthib	test	$len,$len		# check length
21566249468aSthib	jz	.Lcbc_ret
21576249468aSthib
21585cdd308eSdjm	mov	240($key),$rnds_	# key->rounds
21596249468aSthib	mov	$key,$key_		# backup $key
21606249468aSthib	test	%r9d,%r9d		# 6th argument
21616249468aSthib	jz	.Lcbc_decrypt
21626249468aSthib#--------------------------- CBC ENCRYPT ------------------------------#
21636249468aSthib	movups	($ivp),$inout0		# load iv as initial state
21646249468aSthib	mov	$rnds_,$rounds
21655cdd308eSdjm	cmp	\$16,$len
21666249468aSthib	jb	.Lcbc_enc_tail
21676249468aSthib	sub	\$16,$len
21686249468aSthib	jmp	.Lcbc_enc_loop
21696249468aSthib.align	16
21706249468aSthib.Lcbc_enc_loop:
21716249468aSthib	movups	($inp),$inout1		# load input
21726249468aSthib	lea	16($inp),$inp
21735cdd308eSdjm	#xorps	$inout1,$inout0
21746249468aSthib___
21755cdd308eSdjm	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
21766249468aSthib$code.=<<___;
21776249468aSthib	mov	$rnds_,$rounds		# restore $rounds
21786249468aSthib	mov	$key_,$key		# restore $key
21795cdd308eSdjm	movups	$inout0,0($out)		# store output
21805cdd308eSdjm	lea	16($out),$out
21815cdd308eSdjm	sub	\$16,$len
21826249468aSthib	jnc	.Lcbc_enc_loop
21836249468aSthib	add	\$16,$len
21846249468aSthib	jnz	.Lcbc_enc_tail
21856249468aSthib	movups	$inout0,($ivp)
21866249468aSthib	jmp	.Lcbc_ret
21876249468aSthib
21886249468aSthib.Lcbc_enc_tail:
21896249468aSthib	mov	$len,%rcx	# zaps $key
21906249468aSthib	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
21916249468aSthib	.long	0x9066A4F3	# rep movsb
21926249468aSthib	mov	\$16,%ecx	# zero tail
21936249468aSthib	sub	$len,%rcx
21946249468aSthib	xor	%eax,%eax
21956249468aSthib	.long	0x9066AAF3	# rep stosb
21966249468aSthib	lea	-16(%rdi),%rdi	# rewind $out by 1 block
21976249468aSthib	mov	$rnds_,$rounds	# restore $rounds
21986249468aSthib	mov	%rdi,%rsi	# $inp and $out are the same
21996249468aSthib	mov	$key_,$key	# restore $key
22006249468aSthib	xor	$len,$len	# len=16
22016249468aSthib	jmp	.Lcbc_enc_loop	# one more spin
22026249468aSthib#--------------------------- CBC DECRYPT ------------------------------#
22036249468aSthib.align	16
22046249468aSthib.Lcbc_decrypt:
22055c104365Sjsing	lea	(%rsp),%rax
22065c104365Sjsing	push	%rbp
22075c104365Sjsing	sub	\$$frame_size,%rsp
22086249468aSthib___
22096249468aSthib$code.=<<___ if ($win64);
22105c104365Sjsing	movaps	%xmm6,0x10(%rsp)
22115c104365Sjsing	movaps	%xmm7,0x20(%rsp)
22125c104365Sjsing	movaps	%xmm8,0x30(%rsp)
22135c104365Sjsing	movaps	%xmm9,0x40(%rsp)
22146249468aSthib.Lcbc_decrypt_body:
22156249468aSthib___
22166249468aSthib$code.=<<___;
22175c104365Sjsing	lea	-8(%rax),%rbp
22186249468aSthib	movups	($ivp),$iv
22196249468aSthib	mov	$rnds_,$rounds
22205cdd308eSdjm	cmp	\$0x70,$len
22216249468aSthib	jbe	.Lcbc_dec_tail
22225cdd308eSdjm	shr	\$1,$rnds_
22235cdd308eSdjm	sub	\$0x70,$len
22245cdd308eSdjm	mov	$rnds_,$rounds
22255c104365Sjsing	movaps	$iv,(%rsp)
22265cdd308eSdjm	jmp	.Lcbc_dec_loop8_enter
22276249468aSthib.align	16
22285cdd308eSdjm.Lcbc_dec_loop8:
22295c104365Sjsing	movaps	$rndkey0,(%rsp)			# save IV
22305cdd308eSdjm	movups	$inout7,($out)
22315cdd308eSdjm	lea	0x10($out),$out
22325cdd308eSdjm.Lcbc_dec_loop8_enter:
22335cdd308eSdjm	$movkey		($key),$rndkey0
22345cdd308eSdjm	movups	($inp),$inout0			# load input
22356249468aSthib	movups	0x10($inp),$inout1
22365cdd308eSdjm	$movkey		16($key),$rndkey1
22376249468aSthib
22385cdd308eSdjm	lea		32($key),$key
22395cdd308eSdjm	movdqu	0x20($inp),$inout2
22405cdd308eSdjm	xorps		$rndkey0,$inout0
22415cdd308eSdjm	movdqu	0x30($inp),$inout3
22425cdd308eSdjm	xorps		$rndkey0,$inout1
22435cdd308eSdjm	movdqu	0x40($inp),$inout4
22445cdd308eSdjm	aesdec		$rndkey1,$inout0
22455cdd308eSdjm	pxor		$rndkey0,$inout2
22465cdd308eSdjm	movdqu	0x50($inp),$inout5
22475cdd308eSdjm	aesdec		$rndkey1,$inout1
22485cdd308eSdjm	pxor		$rndkey0,$inout3
22495cdd308eSdjm	movdqu	0x60($inp),$inout6
22505cdd308eSdjm	aesdec		$rndkey1,$inout2
22515cdd308eSdjm	pxor		$rndkey0,$inout4
22525cdd308eSdjm	movdqu	0x70($inp),$inout7
22535cdd308eSdjm	aesdec		$rndkey1,$inout3
22545cdd308eSdjm	pxor		$rndkey0,$inout5
22555cdd308eSdjm	dec		$rounds
22565cdd308eSdjm	aesdec		$rndkey1,$inout4
22575cdd308eSdjm	pxor		$rndkey0,$inout6
22585cdd308eSdjm	aesdec		$rndkey1,$inout5
22595cdd308eSdjm	pxor		$rndkey0,$inout7
22605cdd308eSdjm	$movkey		($key),$rndkey0
22615cdd308eSdjm	aesdec		$rndkey1,$inout6
22625cdd308eSdjm	aesdec		$rndkey1,$inout7
22635cdd308eSdjm	$movkey		16($key),$rndkey1
22646249468aSthib
22655cdd308eSdjm	call		.Ldec_loop8_enter
22665cdd308eSdjm
22675cdd308eSdjm	movups	($inp),$rndkey1		# re-load input
22685cdd308eSdjm	movups	0x10($inp),$rndkey0
22695c104365Sjsing	xorps	(%rsp),$inout0		# ^= IV
22705cdd308eSdjm	xorps	$rndkey1,$inout1
22715cdd308eSdjm	movups	0x20($inp),$rndkey1
22725cdd308eSdjm	xorps	$rndkey0,$inout2
22735cdd308eSdjm	movups	0x30($inp),$rndkey0
22745cdd308eSdjm	xorps	$rndkey1,$inout3
22755cdd308eSdjm	movups	0x40($inp),$rndkey1
22765cdd308eSdjm	xorps	$rndkey0,$inout4
22775cdd308eSdjm	movups	0x50($inp),$rndkey0
22785cdd308eSdjm	xorps	$rndkey1,$inout5
22795cdd308eSdjm	movups	0x60($inp),$rndkey1
22805cdd308eSdjm	xorps	$rndkey0,$inout6
22815cdd308eSdjm	movups	0x70($inp),$rndkey0	# IV
22825cdd308eSdjm	xorps	$rndkey1,$inout7
22836249468aSthib	movups	$inout0,($out)
22846249468aSthib	movups	$inout1,0x10($out)
22856249468aSthib	movups	$inout2,0x20($out)
22865cdd308eSdjm	movups	$inout3,0x30($out)
22875cdd308eSdjm	mov	$rnds_,$rounds		# restore $rounds
22885cdd308eSdjm	movups	$inout4,0x40($out)
22895cdd308eSdjm	mov	$key_,$key		# restore $key
22905cdd308eSdjm	movups	$inout5,0x50($out)
22915cdd308eSdjm	lea	0x80($inp),$inp
22925cdd308eSdjm	movups	$inout6,0x60($out)
22935cdd308eSdjm	lea	0x70($out),$out
22945cdd308eSdjm	sub	\$0x80,$len
22955cdd308eSdjm	ja	.Lcbc_dec_loop8
22965cdd308eSdjm
22975cdd308eSdjm	movaps	$inout7,$inout0
22985cdd308eSdjm	movaps	$rndkey0,$iv
22995cdd308eSdjm	add	\$0x70,$len
23005cdd308eSdjm	jle	.Lcbc_dec_tail_collected
23015cdd308eSdjm	movups	$inout0,($out)
23025cdd308eSdjm	lea	1($rnds_,$rnds_),$rounds
23035cdd308eSdjm	lea	0x10($out),$out
23045cdd308eSdjm.Lcbc_dec_tail:
23055cdd308eSdjm	movups	($inp),$inout0
23065cdd308eSdjm	movaps	$inout0,$in0
23075cdd308eSdjm	cmp	\$0x10,$len
23085cdd308eSdjm	jbe	.Lcbc_dec_one
23095cdd308eSdjm
23105cdd308eSdjm	movups	0x10($inp),$inout1
23115cdd308eSdjm	movaps	$inout1,$in1
23125cdd308eSdjm	cmp	\$0x20,$len
23135cdd308eSdjm	jbe	.Lcbc_dec_two
23145cdd308eSdjm
23155cdd308eSdjm	movups	0x20($inp),$inout2
23165cdd308eSdjm	movaps	$inout2,$in2
23175cdd308eSdjm	cmp	\$0x30,$len
23185cdd308eSdjm	jbe	.Lcbc_dec_three
23195cdd308eSdjm
23205cdd308eSdjm	movups	0x30($inp),$inout3
23215cdd308eSdjm	cmp	\$0x40,$len
23225cdd308eSdjm	jbe	.Lcbc_dec_four
23235cdd308eSdjm
23245cdd308eSdjm	movups	0x40($inp),$inout4
23255cdd308eSdjm	cmp	\$0x50,$len
23265cdd308eSdjm	jbe	.Lcbc_dec_five
23275cdd308eSdjm
23285cdd308eSdjm	movups	0x50($inp),$inout5
23295cdd308eSdjm	cmp	\$0x60,$len
23305cdd308eSdjm	jbe	.Lcbc_dec_six
23315cdd308eSdjm
23325cdd308eSdjm	movups	0x60($inp),$inout6
23335c104365Sjsing	movaps	$iv,(%rsp)		# save IV
23345cdd308eSdjm	call	_aesni_decrypt8
23355cdd308eSdjm	movups	($inp),$rndkey1
23365cdd308eSdjm	movups	0x10($inp),$rndkey0
23375c104365Sjsing	xorps	(%rsp),$inout0		# ^= IV
23385cdd308eSdjm	xorps	$rndkey1,$inout1
23395cdd308eSdjm	movups	0x20($inp),$rndkey1
23405cdd308eSdjm	xorps	$rndkey0,$inout2
23415cdd308eSdjm	movups	0x30($inp),$rndkey0
23425cdd308eSdjm	xorps	$rndkey1,$inout3
23435cdd308eSdjm	movups	0x40($inp),$rndkey1
23445cdd308eSdjm	xorps	$rndkey0,$inout4
23455cdd308eSdjm	movups	0x50($inp),$rndkey0
23465cdd308eSdjm	xorps	$rndkey1,$inout5
23475cdd308eSdjm	movups	0x60($inp),$iv		# IV
23485cdd308eSdjm	xorps	$rndkey0,$inout6
23495cdd308eSdjm	movups	$inout0,($out)
23505cdd308eSdjm	movups	$inout1,0x10($out)
23515cdd308eSdjm	movups	$inout2,0x20($out)
23525cdd308eSdjm	movups	$inout3,0x30($out)
23535cdd308eSdjm	movups	$inout4,0x40($out)
23545cdd308eSdjm	movups	$inout5,0x50($out)
23555cdd308eSdjm	lea	0x60($out),$out
23565cdd308eSdjm	movaps	$inout6,$inout0
23575cdd308eSdjm	sub	\$0x70,$len
23586249468aSthib	jmp	.Lcbc_dec_tail_collected
23596249468aSthib.align	16
23606249468aSthib.Lcbc_dec_one:
23616249468aSthib___
23626249468aSthib	&aesni_generate1("dec",$key,$rounds);
23636249468aSthib$code.=<<___;
23645cdd308eSdjm	xorps	$iv,$inout0
23656249468aSthib	movaps	$in0,$iv
23665cdd308eSdjm	sub	\$0x10,$len
23676249468aSthib	jmp	.Lcbc_dec_tail_collected
23686249468aSthib.align	16
23696249468aSthib.Lcbc_dec_two:
23705cdd308eSdjm	xorps	$inout2,$inout2
23716249468aSthib	call	_aesni_decrypt3
23725cdd308eSdjm	xorps	$iv,$inout0
23735cdd308eSdjm	xorps	$in0,$inout1
23746249468aSthib	movups	$inout0,($out)
23756249468aSthib	movaps	$in1,$iv
23766249468aSthib	movaps	$inout1,$inout0
23776249468aSthib	lea	0x10($out),$out
23785cdd308eSdjm	sub	\$0x20,$len
23796249468aSthib	jmp	.Lcbc_dec_tail_collected
23806249468aSthib.align	16
23816249468aSthib.Lcbc_dec_three:
23826249468aSthib	call	_aesni_decrypt3
23835cdd308eSdjm	xorps	$iv,$inout0
23845cdd308eSdjm	xorps	$in0,$inout1
23856249468aSthib	movups	$inout0,($out)
23865cdd308eSdjm	xorps	$in1,$inout2
23876249468aSthib	movups	$inout1,0x10($out)
23886249468aSthib	movaps	$in2,$iv
23896249468aSthib	movaps	$inout2,$inout0
23906249468aSthib	lea	0x20($out),$out
23915cdd308eSdjm	sub	\$0x30,$len
23925cdd308eSdjm	jmp	.Lcbc_dec_tail_collected
23935cdd308eSdjm.align	16
23945cdd308eSdjm.Lcbc_dec_four:
23955cdd308eSdjm	call	_aesni_decrypt4
23965cdd308eSdjm	xorps	$iv,$inout0
23975cdd308eSdjm	movups	0x30($inp),$iv
23985cdd308eSdjm	xorps	$in0,$inout1
23995cdd308eSdjm	movups	$inout0,($out)
24005cdd308eSdjm	xorps	$in1,$inout2
24015cdd308eSdjm	movups	$inout1,0x10($out)
24025cdd308eSdjm	xorps	$in2,$inout3
24035cdd308eSdjm	movups	$inout2,0x20($out)
24045cdd308eSdjm	movaps	$inout3,$inout0
24055cdd308eSdjm	lea	0x30($out),$out
24065cdd308eSdjm	sub	\$0x40,$len
24075cdd308eSdjm	jmp	.Lcbc_dec_tail_collected
24085cdd308eSdjm.align	16
24095cdd308eSdjm.Lcbc_dec_five:
24105cdd308eSdjm	xorps	$inout5,$inout5
24115cdd308eSdjm	call	_aesni_decrypt6
24125cdd308eSdjm	movups	0x10($inp),$rndkey1
24135cdd308eSdjm	movups	0x20($inp),$rndkey0
24145cdd308eSdjm	xorps	$iv,$inout0
24155cdd308eSdjm	xorps	$in0,$inout1
24165cdd308eSdjm	xorps	$rndkey1,$inout2
24175cdd308eSdjm	movups	0x30($inp),$rndkey1
24185cdd308eSdjm	xorps	$rndkey0,$inout3
24195cdd308eSdjm	movups	0x40($inp),$iv
24205cdd308eSdjm	xorps	$rndkey1,$inout4
24215cdd308eSdjm	movups	$inout0,($out)
24225cdd308eSdjm	movups	$inout1,0x10($out)
24235cdd308eSdjm	movups	$inout2,0x20($out)
24245cdd308eSdjm	movups	$inout3,0x30($out)
24255cdd308eSdjm	lea	0x40($out),$out
24265cdd308eSdjm	movaps	$inout4,$inout0
24275cdd308eSdjm	sub	\$0x50,$len
24285cdd308eSdjm	jmp	.Lcbc_dec_tail_collected
24295cdd308eSdjm.align	16
24305cdd308eSdjm.Lcbc_dec_six:
24315cdd308eSdjm	call	_aesni_decrypt6
24325cdd308eSdjm	movups	0x10($inp),$rndkey1
24335cdd308eSdjm	movups	0x20($inp),$rndkey0
24345cdd308eSdjm	xorps	$iv,$inout0
24355cdd308eSdjm	xorps	$in0,$inout1
24365cdd308eSdjm	xorps	$rndkey1,$inout2
24375cdd308eSdjm	movups	0x30($inp),$rndkey1
24385cdd308eSdjm	xorps	$rndkey0,$inout3
24395cdd308eSdjm	movups	0x40($inp),$rndkey0
24405cdd308eSdjm	xorps	$rndkey1,$inout4
24415cdd308eSdjm	movups	0x50($inp),$iv
24425cdd308eSdjm	xorps	$rndkey0,$inout5
24435cdd308eSdjm	movups	$inout0,($out)
24445cdd308eSdjm	movups	$inout1,0x10($out)
24455cdd308eSdjm	movups	$inout2,0x20($out)
24465cdd308eSdjm	movups	$inout3,0x30($out)
24475cdd308eSdjm	movups	$inout4,0x40($out)
24485cdd308eSdjm	lea	0x50($out),$out
24495cdd308eSdjm	movaps	$inout5,$inout0
24505cdd308eSdjm	sub	\$0x60,$len
24516249468aSthib	jmp	.Lcbc_dec_tail_collected
24526249468aSthib.align	16
24536249468aSthib.Lcbc_dec_tail_collected:
24546249468aSthib	and	\$15,$len
24556249468aSthib	movups	$iv,($ivp)
24566249468aSthib	jnz	.Lcbc_dec_tail_partial
24576249468aSthib	movups	$inout0,($out)
24586249468aSthib	jmp	.Lcbc_dec_ret
24595cdd308eSdjm.align	16
24606249468aSthib.Lcbc_dec_tail_partial:
24615c104365Sjsing	movaps	$inout0,(%rsp)
24625cdd308eSdjm	mov	\$16,%rcx
24636249468aSthib	mov	$out,%rdi
24645cdd308eSdjm	sub	$len,%rcx
24655c104365Sjsing	lea	(%rsp),%rsi
24666249468aSthib	.long	0x9066A4F3	# rep movsb
24676249468aSthib
24686249468aSthib.Lcbc_dec_ret:
24696249468aSthib___
24706249468aSthib$code.=<<___ if ($win64);
24715c104365Sjsing	movaps	0x10(%rsp),%xmm6
24725c104365Sjsing	movaps	0x20(%rsp),%xmm7
24735c104365Sjsing	movaps	0x30(%rsp),%xmm8
24745c104365Sjsing	movaps	0x40(%rsp),%xmm9
24756249468aSthib___
24766249468aSthib$code.=<<___;
24775c104365Sjsing	lea	(%rbp),%rsp
24785c104365Sjsing	pop	%rbp
24796249468aSthib.Lcbc_ret:
24806249468aSthib	ret
24816249468aSthib.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
24826249468aSthib___
24835cdd308eSdjm}
# Key-schedule setup.  One Perl scope covers both ${PREFIX}_set_decrypt_key
# (emitted here) and ${PREFIX}_set_encrypt_key (emitted further down).
#
24846249468aSthib# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
24856249468aSthib#				int bits, AES_KEY *key)
#
# set_decrypt_key builds the decryption schedule in place: it runs the
# encryption key setup, then reverses the order of the round keys and
# applies aesimc to every key except the first and the last.  The status
# produced by __aesni_set_encrypt_key is left in %eax and returned; a
# non-zero status skips the conversion entirely.
#
# The three arguments come from @_4args, which is defined outside the part
# of the file visible here.  $bits is rewritten to its 32-bit register name
# (%rXX -> %eXX) because the code below operates on it as a 32-bit value.
24866249468aSthib{ my ($inp,$bits,$key) = @_4args;
24876249468aSthib  $bits =~ s/%r/%e/;
24886249468aSthib
24896249468aSthib$code.=<<___;
24906249468aSthib.globl	${PREFIX}_set_decrypt_key
24916249468aSthib.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
24926249468aSthib.align	16
24936249468aSthib${PREFIX}_set_decrypt_key:
249422787c51Stb	_CET_ENDBR
# The 8-byte stack adjustment is the whole prologue; the Win64 unwind data
# at .LSEH_info_key describes exactly this allocation.
2495b407f3afSmiod	sub	\$8,%rsp
24965cdd308eSdjm	call	__aesni_set_encrypt_key
# The bits register now holds rounds-1; times 16 it becomes a byte offset.
24976249468aSthib	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
24986249468aSthib	test	%eax,%eax
24996249468aSthib	jnz	.Ldec_key_ret
25006249468aSthib	lea	16($key,$bits),$inp	# points at the end of key schedule
25016249468aSthib
# First and last round keys trade places unchanged (no aesimc).
25026249468aSthib	$movkey	($key),%xmm0		# just swap
25036249468aSthib	$movkey	($inp),%xmm1
25046249468aSthib	$movkey	%xmm0,($inp)
25056249468aSthib	$movkey	%xmm1,($key)
25066249468aSthib	lea	16($key),$key
25076249468aSthib	lea	-16($inp),$inp
25086249468aSthib
# Walk inward from both ends: each pass inverts one key from the front and
# one from the back and stores each in the slot the other came from.
25096249468aSthib.Ldec_key_inverse:
25106249468aSthib	$movkey	($key),%xmm0		# swap and inverse
25116249468aSthib	$movkey	($inp),%xmm1
25126249468aSthib	aesimc	%xmm0,%xmm0
25136249468aSthib	aesimc	%xmm1,%xmm1
25146249468aSthib	lea	16($key),$key
25156249468aSthib	lea	-16($inp),$inp
25166249468aSthib	$movkey	%xmm0,16($inp)
25176249468aSthib	$movkey	%xmm1,-16($key)
25185cdd308eSdjm	cmp	$key,$inp
25196249468aSthib	ja	.Ldec_key_inverse
25206249468aSthib
# The schedule has an odd number of round keys, so both pointers have met
# on the middle one; it is inverted in place.
25216249468aSthib	$movkey	($key),%xmm0		# inverse middle
25226249468aSthib	aesimc	%xmm0,%xmm0
25236249468aSthib	$movkey	%xmm0,($inp)
25246249468aSthib.Ldec_key_ret:
25256249468aSthib	add	\$8,%rsp
25266249468aSthib	ret
25276249468aSthib.LSEH_end_set_decrypt_key:
25286249468aSthib.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
25296249468aSthib___
25306249468aSthib
25316249468aSthib# This is based on submission by
25326249468aSthib#
25336249468aSthib#	Huang Ying <ying.huang@intel.com>
25346249468aSthib#	Vinodh Gopal <vinodh.gopal@intel.com>
25356249468aSthib#	Kahraman Akdemir
25366249468aSthib#
253771743258Sjmc# Aggressively optimized with respect to aeskeygenassist's critical path
25386249468aSthib# and is contained in %xmm0-5 to meet Win64 ABI requirement.
25396249468aSthib#
# Status returned in %rax/%eax by the code below:
#	 0	success
#	-1	userKey or key pointer is NULL
#	-2	bits is not 128, 192 or 256
#
# Register roles during expansion: %xmm0 holds the most recent round key,
# %xmm2 carries the rest of the key material for 192- and 256-bit keys,
# %xmm1 receives each aeskeygenassist result, %xmm4 is scratch whose low
# dword must stay zero, and %rax is the write pointer into the schedule.
# $bits is reused to hold rounds-1 (9, 11 or 13); in all three paths the
# final store lands at byte offset 240 of the key structure.
#
# __aesni_set_encrypt_key is a second label on the same code, placed after
# _CET_ENDBR; set_decrypt_key calls it directly.
25406249468aSthib$code.=<<___;
25416249468aSthib.globl	${PREFIX}_set_encrypt_key
25426249468aSthib.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
25436249468aSthib.align	16
25446249468aSthib${PREFIX}_set_encrypt_key:
254522787c51Stb	_CET_ENDBR
25465cdd308eSdjm__aesni_set_encrypt_key:
2547b407f3afSmiod	sub	\$8,%rsp
# Preload the NULL-argument status; it is returned as-is when either of the
# two pointer tests below fails.
25486249468aSthib	mov	\$-1,%rax
25495cdd308eSdjm	test	$inp,$inp
25506249468aSthib	jz	.Lenc_key_ret
25516249468aSthib	test	$key,$key
25526249468aSthib	jz	.Lenc_key_ret
25536249468aSthib
25546249468aSthib	movups	($inp),%xmm0		# pull first 128 bits of *userKey
25555cdd308eSdjm	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
# From here on rax is the write pointer: slot 1 of the schedule.
25566249468aSthib	lea	16($key),%rax
25576249468aSthib	cmp	\$256,$bits
25586249468aSthib	je	.L14rounds
25596249468aSthib	cmp	\$192,$bits
25606249468aSthib	je	.L12rounds
25616249468aSthib	cmp	\$128,$bits
25626249468aSthib	jne	.Lbad_keybits
25636249468aSthib
# 128-bit key: one helper call per round key.  The first call uses the cold
# entry because round 0 has already been stored explicitly.
25646249468aSthib.L10rounds:
25656249468aSthib	mov	\$9,$bits			# 10 rounds for 128-bit key
25666249468aSthib	$movkey	%xmm0,($key)			# round 0
25676249468aSthib	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
25686249468aSthib	call		.Lkey_expansion_128_cold
25696249468aSthib	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
25706249468aSthib	call		.Lkey_expansion_128
25716249468aSthib	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
25726249468aSthib	call		.Lkey_expansion_128
25736249468aSthib	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
25746249468aSthib	call		.Lkey_expansion_128
25756249468aSthib	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
25766249468aSthib	call		.Lkey_expansion_128
25776249468aSthib	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
25786249468aSthib	call		.Lkey_expansion_128
25796249468aSthib	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
25806249468aSthib	call		.Lkey_expansion_128
25816249468aSthib	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
25826249468aSthib	call		.Lkey_expansion_128
25836249468aSthib	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
25846249468aSthib	call		.Lkey_expansion_128
25856249468aSthib	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
25866249468aSthib	call		.Lkey_expansion_128
# Store the last round key, then rounds-1 at offset 240 (rax is key+160).
25876249468aSthib	$movkey	%xmm0,(%rax)
25886249468aSthib	mov	$bits,80(%rax)	# 240(%rdx)
25896249468aSthib	xor	%eax,%eax
25906249468aSthib	jmp	.Lenc_key_ret
25916249468aSthib
# 192-bit key: only 64 more bits of user key are loaded.  The a/b helpers
# alternate; a 192b call stores 32 bytes, a 192a call stores 16.
25926249468aSthib.align	16
25936249468aSthib.L12rounds:
25946249468aSthib	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
25956249468aSthib	mov	\$11,$bits			# 12 rounds for 192
25966249468aSthib	$movkey	%xmm0,($key)			# round 0
25976249468aSthib	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
25986249468aSthib	call		.Lkey_expansion_192a_cold
25996249468aSthib	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
26006249468aSthib	call		.Lkey_expansion_192b
26016249468aSthib	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
26026249468aSthib	call		.Lkey_expansion_192a
26036249468aSthib	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
26046249468aSthib	call		.Lkey_expansion_192b
26056249468aSthib	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
26066249468aSthib	call		.Lkey_expansion_192a
26076249468aSthib	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
26086249468aSthib	call		.Lkey_expansion_192b
26096249468aSthib	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
26106249468aSthib	call		.Lkey_expansion_192a
26116249468aSthib	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
26126249468aSthib	call		.Lkey_expansion_192b
# rax is key+192 here, so 48(rax) is again offset 240.
26136249468aSthib	$movkey	%xmm0,(%rax)
26146249468aSthib	mov	$bits,48(%rax)	# 240(%rdx)
26156249468aSthib	xor	%rax, %rax
26166249468aSthib	jmp	.Lenc_key_ret
26176249468aSthib
# 256-bit key: both halves of the user key are stored as rounds 0 and 1,
# so the write pointer starts one slot further on.  The a helper updates
# xmm0 and the b helper updates xmm2; the same round constant feeds each
# a/b pair.
26186249468aSthib.align	16
26196249468aSthib.L14rounds:
262071743258Sjmc	movups	16($inp),%xmm2			# remaining half of *userKey
26216249468aSthib	mov	\$13,$bits			# 14 rounds for 256
26226249468aSthib	lea	16(%rax),%rax
26236249468aSthib	$movkey	%xmm0,($key)			# round 0
26246249468aSthib	$movkey	%xmm2,16($key)			# round 1
26256249468aSthib	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
26266249468aSthib	call		.Lkey_expansion_256a_cold
26276249468aSthib	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
26286249468aSthib	call		.Lkey_expansion_256b
26296249468aSthib	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
26306249468aSthib	call		.Lkey_expansion_256a
26316249468aSthib	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
26326249468aSthib	call		.Lkey_expansion_256b
26336249468aSthib	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
26346249468aSthib	call		.Lkey_expansion_256a
26356249468aSthib	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
26366249468aSthib	call		.Lkey_expansion_256b
26376249468aSthib	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
26386249468aSthib	call		.Lkey_expansion_256a
26396249468aSthib	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
26406249468aSthib	call		.Lkey_expansion_256b
26416249468aSthib	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
26426249468aSthib	call		.Lkey_expansion_256a
26436249468aSthib	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
26446249468aSthib	call		.Lkey_expansion_256b
26456249468aSthib	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
26466249468aSthib	call		.Lkey_expansion_256a
26476249468aSthib	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
26486249468aSthib	call		.Lkey_expansion_256b
26496249468aSthib	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
26506249468aSthib	call		.Lkey_expansion_256a
# rax is key+224 here, so 16(rax) is again offset 240.
26516249468aSthib	$movkey	%xmm0,(%rax)
26526249468aSthib	mov	$bits,16(%rax)	# 240(%rdx)
26536249468aSthib	xor	%rax,%rax
26546249468aSthib	jmp	.Lenc_key_ret
26556249468aSthib
# Unsupported key length.
26566249468aSthib.align	16
26576249468aSthib.Lbad_keybits:
26586249468aSthib	mov	\$-2,%rax
26596249468aSthib.Lenc_key_ret:
26606249468aSthib	add	\$8,%rsp
26616249468aSthib	ret
26626249468aSthib.LSEH_end_set_encrypt_key:
26636249468aSthib
# Key-expansion helpers, reached by call from the round sequences of
# set_encrypt_key.  On entry xmm1 holds the aeskeygenassist result for the
# step.  The plain entry points first store the pending round key at (rax)
# and advance rax; the _cold entries skip that store.
#
# 128-bit step: the two shufps/xorps pairs turn xmm0 into the running XOR
# of its own four dwords (xmm4 supplies the zero that is shifted in and its
# low dword is left zero for the next call); the last shufps broadcasts
# dword 3 of xmm1, which is then folded into every dword of xmm0.
26646249468aSthib.align	16
26656249468aSthib.Lkey_expansion_128:
26666249468aSthib	$movkey	%xmm0,(%rax)
26676249468aSthib	lea	16(%rax),%rax
26686249468aSthib.Lkey_expansion_128_cold:
26696249468aSthib	shufps	\$0b00010000,%xmm0,%xmm4
26705cdd308eSdjm	xorps	%xmm4, %xmm0
26716249468aSthib	shufps	\$0b10001100,%xmm0,%xmm4
26725cdd308eSdjm	xorps	%xmm4, %xmm0
26735cdd308eSdjm	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
26745cdd308eSdjm	xorps	%xmm1,%xmm0
26756249468aSthib	ret
26766249468aSthib
# 192-bit step, first flavour: stores one 16-byte round key, keeps a copy
# of the current xmm2 in xmm5 for the following 192b call, then falls into
# the shared update code.  That code updates xmm0 as in the 128-bit step
# but with dword 1 of xmm1 broadcast, and updates xmm2 from its own shifted
# copy and the new top dword of xmm0.
26776249468aSthib.align 16
26786249468aSthib.Lkey_expansion_192a:
26796249468aSthib	$movkey	%xmm0,(%rax)
26806249468aSthib	lea	16(%rax),%rax
26816249468aSthib.Lkey_expansion_192a_cold:
26826249468aSthib	movaps	%xmm2, %xmm5
26836249468aSthib.Lkey_expansion_192b_warm:
26846249468aSthib	shufps	\$0b00010000,%xmm0,%xmm4
26855cdd308eSdjm	movdqa	%xmm2,%xmm3
26865cdd308eSdjm	xorps	%xmm4,%xmm0
26876249468aSthib	shufps	\$0b10001100,%xmm0,%xmm4
26886249468aSthib	pslldq	\$4,%xmm3
26895cdd308eSdjm	xorps	%xmm4,%xmm0
26906249468aSthib	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
26916249468aSthib	pxor	%xmm3,%xmm2
26926249468aSthib	pxor	%xmm1,%xmm0
26936249468aSthib	pshufd	\$0b11111111,%xmm0,%xmm3
26946249468aSthib	pxor	%xmm3,%xmm2
26956249468aSthib	ret
26966249468aSthib
# 192-bit step, second flavour: stores two round keys (32 bytes).  The
# first is the low half of xmm5 followed by the low half of xmm0; the
# second is the high half of xmm0 followed by the low half of xmm2.  It
# then joins the shared update code without touching xmm5.
26976249468aSthib.align 16
26986249468aSthib.Lkey_expansion_192b:
26996249468aSthib	movaps	%xmm0,%xmm3
27006249468aSthib	shufps	\$0b01000100,%xmm0,%xmm5
27016249468aSthib	$movkey	%xmm5,(%rax)
27026249468aSthib	shufps	\$0b01001110,%xmm2,%xmm3
27036249468aSthib	$movkey	%xmm3,16(%rax)
27046249468aSthib	lea	32(%rax),%rax
27056249468aSthib	jmp	.Lkey_expansion_192b_warm
27066249468aSthib
# 256-bit step, even rounds: stores xmm2, then updates xmm0 exactly like
# the 128-bit step (dword 3 of xmm1 broadcast).
27076249468aSthib.align	16
27086249468aSthib.Lkey_expansion_256a:
27096249468aSthib	$movkey	%xmm2,(%rax)
27106249468aSthib	lea	16(%rax),%rax
27116249468aSthib.Lkey_expansion_256a_cold:
27126249468aSthib	shufps	\$0b00010000,%xmm0,%xmm4
27135cdd308eSdjm	xorps	%xmm4,%xmm0
27146249468aSthib	shufps	\$0b10001100,%xmm0,%xmm4
27155cdd308eSdjm	xorps	%xmm4,%xmm0
27165cdd308eSdjm	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
27175cdd308eSdjm	xorps	%xmm1,%xmm0
27186249468aSthib	ret
27196249468aSthib
# 256-bit step, odd rounds: stores xmm0, then applies the same running-XOR
# update to xmm2, this time with dword 2 of xmm1 broadcast.
27206249468aSthib.align 16
27216249468aSthib.Lkey_expansion_256b:
27226249468aSthib	$movkey	%xmm0,(%rax)
27236249468aSthib	lea	16(%rax),%rax
27246249468aSthib
27256249468aSthib	shufps	\$0b00010000,%xmm2,%xmm4
27265cdd308eSdjm	xorps	%xmm4,%xmm2
27276249468aSthib	shufps	\$0b10001100,%xmm2,%xmm4
27285cdd308eSdjm	xorps	%xmm4,%xmm2
27295cdd308eSdjm	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
27305cdd308eSdjm	xorps	%xmm1,%xmm2
27316249468aSthib	ret
# Both symbols share this code, so each gets a size covering all of it.
27326249468aSthib.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
27335cdd308eSdjm.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
27346249468aSthib___
27356249468aSthib}
27366249468aSthib
# Constant pool.  The data is placed in .rodata, 64-byte aligned, and the
# assembler is switched back to .text afterwards.  The labels are used by
# routines outside the part of the file visible here; judging by the names
# they are presumably a byte-reversal shuffle mask, the counter increments
# for the 32-bit and 64-bit counter modes, and the XTS tweak constant --
# TODO confirm against the code that loads them.
27376249468aSthib$code.=<<___;
2738eda85684Stb.section .rodata
27395cdd308eSdjm.align	64
27405cdd308eSdjm.Lbswap_mask:
27415cdd308eSdjm	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
27425cdd308eSdjm.Lincrement32:
27435cdd308eSdjm	.long	6,6,6,0
27445cdd308eSdjm.Lincrement64:
27455cdd308eSdjm	.long	1,0,0,0
27465cdd308eSdjm.Lxts_magic:
27475cdd308eSdjm	.long	0x87,0,1,0
27486249468aSthib.align	64
274908705922Stb.text
27506249468aSthib___
27516249468aSthib
# Win64 structured-exception-handling support.  Everything from here to the
# closing brace of this if-block (handlers, .pdata and .xdata tables) is
# generated only when $win64 is set.
#
# Every handler below has this signature:
27526249468aSthib# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
27536249468aSthib#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
27546249468aSthibif ($win64) {
# Registers carrying the four handler arguments.  Only $context and $disp
# are referenced by the handlers in this section.
27556249468aSthib$rec="%rcx";
27566249468aSthib$frame="%rdx";
27576249468aSthib$context="%r8";
27586249468aSthib$disp="%r9";
27596249468aSthib
# The common tail of the handlers calls RtlVirtualUnwind through this
# import slot.
27606249468aSthib$code.=<<___;
27616249468aSthib.extern	__imp_RtlVirtualUnwind
27625cdd308eSdjm___
# The handlers in this heredoc (ecb, ccm64, ctr32, xts) are generated only
# when $PREFIX is "aesni".
#
# ecb_se_handler: saves the non-volatile registers and flags, reserves 64
# bytes of stack, loads context->Rsp into %rax and hands over to
# .Lcommon_seh_tail.  Nothing else is restored here, so the common tail
# works directly from the stack pointer recorded in the context.
27635cdd308eSdjm$code.=<<___ if ($PREFIX eq "aesni");
27645cdd308eSdjm.type	ecb_se_handler,\@abi-omnipotent
27655cdd308eSdjm.align	16
27665cdd308eSdjmecb_se_handler:
276722787c51Stb	_CET_ENDBR
27685cdd308eSdjm	push	%rsi
27695cdd308eSdjm	push	%rdi
27705cdd308eSdjm	push	%rbx
27715cdd308eSdjm	push	%rbp
27725cdd308eSdjm	push	%r12
27735cdd308eSdjm	push	%r13
27745cdd308eSdjm	push	%r14
27755cdd308eSdjm	push	%r15
27765cdd308eSdjm	pushfq
27775cdd308eSdjm	sub	\$64,%rsp
27785cdd308eSdjm
27795cdd308eSdjm	mov	152($context),%rax	# pull context->Rsp
27805cdd308eSdjm
27815cdd308eSdjm	jmp	.Lcommon_seh_tail
27825cdd308eSdjm.size	ecb_se_handler,.-ecb_se_handler
27835cdd308eSdjm
# ccm64_se_handler: used by both .LSEH_info_ccm64_enc and
# .LSEH_info_ccm64_dec.  HandlerData[0] and HandlerData[1] are the RVAs of
# the body and return labels of the routine being unwound.
#   Rip before the body label   -> unwind from context->Rax
#   Rip at or after the return  -> unwind from context->Rsp
#   otherwise                   -> copy four xmm registers (8 quadwords)
#                                  from the stack into context.Xmm6 onward
#                                  and step the stack pointer by 0x58
27845cdd308eSdjm.type	ccm64_se_handler,\@abi-omnipotent
27855cdd308eSdjm.align	16
27865cdd308eSdjmccm64_se_handler:
278722787c51Stb	_CET_ENDBR
27885cdd308eSdjm	push	%rsi
27895cdd308eSdjm	push	%rdi
27905cdd308eSdjm	push	%rbx
27915cdd308eSdjm	push	%rbp
27925cdd308eSdjm	push	%r12
27935cdd308eSdjm	push	%r13
27945cdd308eSdjm	push	%r14
27955cdd308eSdjm	push	%r15
27965cdd308eSdjm	pushfq
27975cdd308eSdjm	sub	\$64,%rsp
27985cdd308eSdjm
27995cdd308eSdjm	mov	120($context),%rax	# pull context->Rax
28005cdd308eSdjm	mov	248($context),%rbx	# pull context->Rip
28015cdd308eSdjm
28025cdd308eSdjm	mov	8($disp),%rsi		# disp->ImageBase
28035cdd308eSdjm	mov	56($disp),%r11		# disp->HandlerData
28045cdd308eSdjm
# HandlerData entries are 32-bit RVAs; adding ImageBase gives an address.
28055cdd308eSdjm	mov	0(%r11),%r10d		# HandlerData[0]
28065cdd308eSdjm	lea	(%rsi,%r10),%r10	# prologue label
28075cdd308eSdjm	cmp	%r10,%rbx		# context->Rip<prologue label
28085cdd308eSdjm	jb	.Lcommon_seh_tail
28095cdd308eSdjm
28105cdd308eSdjm	mov	152($context),%rax	# pull context->Rsp
28115cdd308eSdjm
28125cdd308eSdjm	mov	4(%r11),%r10d		# HandlerData[1]
28135cdd308eSdjm	lea	(%rsi,%r10),%r10	# epilogue label
28145cdd308eSdjm	cmp	%r10,%rbx		# context->Rip>=epilogue label
28155cdd308eSdjm	jae	.Lcommon_seh_tail
28165cdd308eSdjm
28175cdd308eSdjm	lea	0(%rax),%rsi		# %xmm save area
28185cdd308eSdjm	lea	512($context),%rdi	# &context.Xmm6
28195cdd308eSdjm	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
28205cdd308eSdjm	.long	0xa548f3fc		# cld; rep movsq
28215cdd308eSdjm	lea	0x58(%rax),%rax		# adjust stack pointer
28225cdd308eSdjm
28235cdd308eSdjm	jmp	.Lcommon_seh_tail
28245cdd308eSdjm.size	ccm64_se_handler,.-ccm64_se_handler
28255cdd308eSdjm
# ctr32_se_handler: the range checks use the fixed labels .Lctr32_body and
# .Lctr32_ret instead of HandlerData.
#   Rip before .Lctr32_body     -> unwind from context->Rax
#   Rip at or after .Lctr32_ret -> unwind from context->Rsp
#   otherwise                   -> copy ten xmm registers (20 quadwords)
#                                  from 0x20 above Rsp into context.Xmm6
#                                  onward, then unwind the frame through
#                                  the saved rbp at .Lcommon_rbp_tail
28265cdd308eSdjm.type	ctr32_se_handler,\@abi-omnipotent
28275cdd308eSdjm.align	16
28285cdd308eSdjmctr32_se_handler:
282922787c51Stb	_CET_ENDBR
28305cdd308eSdjm	push	%rsi
28315cdd308eSdjm	push	%rdi
28325cdd308eSdjm	push	%rbx
28335cdd308eSdjm	push	%rbp
28345cdd308eSdjm	push	%r12
28355cdd308eSdjm	push	%r13
28365cdd308eSdjm	push	%r14
28375cdd308eSdjm	push	%r15
28385cdd308eSdjm	pushfq
28395cdd308eSdjm	sub	\$64,%rsp
28405cdd308eSdjm
28415cdd308eSdjm	mov	120($context),%rax	# pull context->Rax
28425cdd308eSdjm	mov	248($context),%rbx	# pull context->Rip
28435cdd308eSdjm
28445cdd308eSdjm	lea	.Lctr32_body(%rip),%r10
28455cdd308eSdjm	cmp	%r10,%rbx		# context->Rip<"prologue" label
28465cdd308eSdjm	jb	.Lcommon_seh_tail
28475cdd308eSdjm
28485cdd308eSdjm	mov	152($context),%rax	# pull context->Rsp
28495cdd308eSdjm
28505cdd308eSdjm	lea	.Lctr32_ret(%rip),%r10
28515cdd308eSdjm	cmp	%r10,%rbx
28525cdd308eSdjm	jae	.Lcommon_seh_tail
28535cdd308eSdjm
28545cdd308eSdjm	lea	0x20(%rax),%rsi		# %xmm save area
28555cdd308eSdjm	lea	512($context),%rdi	# &context.Xmm6
28565cdd308eSdjm	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
28575cdd308eSdjm	.long	0xa548f3fc		# cld; rep movsq
28585cdd308eSdjm
28595c104365Sjsing	jmp	.Lcommon_rbp_tail
28605cdd308eSdjm.size	ctr32_se_handler,.-ctr32_se_handler
28615cdd308eSdjm
# xts_se_handler: used by both .LSEH_info_xts_enc and .LSEH_info_xts_dec;
# the body and epilogue labels come from HandlerData[0] and HandlerData[1].
#   Rip before the body label    -> unwind from context->Rax
#   Rip at or after the epilogue -> unwind from context->Rsp
#   otherwise                    -> copy ten xmm registers (20 quadwords)
#                                   from 0x60 above Rsp into context.Xmm6
#                                   onward, then unwind the frame through
#                                   the saved rbp at .Lcommon_rbp_tail
28625cdd308eSdjm.type	xts_se_handler,\@abi-omnipotent
28635cdd308eSdjm.align	16
28645cdd308eSdjmxts_se_handler:
286522787c51Stb	_CET_ENDBR
28665cdd308eSdjm	push	%rsi
28675cdd308eSdjm	push	%rdi
28685cdd308eSdjm	push	%rbx
28695cdd308eSdjm	push	%rbp
28705cdd308eSdjm	push	%r12
28715cdd308eSdjm	push	%r13
28725cdd308eSdjm	push	%r14
28735cdd308eSdjm	push	%r15
28745cdd308eSdjm	pushfq
28755cdd308eSdjm	sub	\$64,%rsp
28765cdd308eSdjm
28775cdd308eSdjm	mov	120($context),%rax	# pull context->Rax
28785cdd308eSdjm	mov	248($context),%rbx	# pull context->Rip
28795cdd308eSdjm
28805cdd308eSdjm	mov	8($disp),%rsi		# disp->ImageBase
28815cdd308eSdjm	mov	56($disp),%r11		# disp->HandlerData
28825cdd308eSdjm
# HandlerData entries are 32-bit RVAs; adding ImageBase gives an address.
28835cdd308eSdjm	mov	0(%r11),%r10d		# HandlerData[0]
288471743258Sjmc	lea	(%rsi,%r10),%r10	# prologue label
28855cdd308eSdjm	cmp	%r10,%rbx		# context->Rip<prologue label
28865cdd308eSdjm	jb	.Lcommon_seh_tail
28875cdd308eSdjm
28885cdd308eSdjm	mov	152($context),%rax	# pull context->Rsp
28895cdd308eSdjm
28905cdd308eSdjm	mov	4(%r11),%r10d		# HandlerData[1]
28915cdd308eSdjm	lea	(%rsi,%r10),%r10	# epilogue label
28925cdd308eSdjm	cmp	%r10,%rbx		# context->Rip>=epilogue label
28935cdd308eSdjm	jae	.Lcommon_seh_tail
28945cdd308eSdjm
28955cdd308eSdjm	lea	0x60(%rax),%rsi		# %xmm save area
28965cdd308eSdjm	lea	512($context),%rdi	# & context.Xmm6
28975cdd308eSdjm	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
28985cdd308eSdjm	.long	0xa548f3fc		# cld; rep movsq
28995cdd308eSdjm
29005c104365Sjsing	jmp	.Lcommon_rbp_tail
29015cdd308eSdjm.size	xts_se_handler,.-xts_se_handler
29025cdd308eSdjm___
# cbc_se_handler is generated for every $PREFIX.  It also hosts the two
# tails the other handlers jump to:
#
#   .Lcommon_rbp_tail  takes the frame base from context->Rbp, reloads the
#                      saved %rbp stored there, writes it back into the
#                      context and leaves %rax pointing just above it.
#   .Lcommon_seh_tail  expects the unwound stack pointer in %rax; reads the
#                      values for Rdi and Rsi from 8(%rax) and 16(%rax),
#                      stores Rsp, Rsi and Rdi into the context, copies the
#                      context to disp->ContextRecord, calls
#                      RtlVirtualUnwind and returns ExceptionContinueSearch.
#
# Range checks performed by cbc_se_handler itself:
#   Rip before .Lcbc_decrypt       -> common tail, stack pointer unchanged
#   Rip before .Lcbc_decrypt_body  -> common tail, using context->Rax
#   Rip at or after .Lcbc_ret      -> common tail, stack pointer unchanged
#   otherwise                      -> restore four xmm registers from
#                                     16 above Rsp, then the rbp tail
29035cdd308eSdjm$code.=<<___;
29046249468aSthib.type	cbc_se_handler,\@abi-omnipotent
29056249468aSthib.align	16
29066249468aSthibcbc_se_handler:
290722787c51Stb	_CET_ENDBR
29086249468aSthib	push	%rsi
29096249468aSthib	push	%rdi
29106249468aSthib	push	%rbx
29116249468aSthib	push	%rbp
29126249468aSthib	push	%r12
29136249468aSthib	push	%r13
29146249468aSthib	push	%r14
29156249468aSthib	push	%r15
29166249468aSthib	pushfq
29176249468aSthib	sub	\$64,%rsp
29186249468aSthib
29196249468aSthib	mov	152($context),%rax	# pull context->Rsp
29206249468aSthib	mov	248($context),%rbx	# pull context->Rip
29216249468aSthib
29226249468aSthib	lea	.Lcbc_decrypt(%rip),%r10
29236249468aSthib	cmp	%r10,%rbx		# context->Rip<"prologue" label
29245cdd308eSdjm	jb	.Lcommon_seh_tail
29256249468aSthib
29266249468aSthib	lea	.Lcbc_decrypt_body(%rip),%r10
29276249468aSthib	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
29285cdd308eSdjm	jb	.Lrestore_cbc_rax
29296249468aSthib
29306249468aSthib	lea	.Lcbc_ret(%rip),%r10
29316249468aSthib	cmp	%r10,%rbx		# context->Rip>="epilogue" label
29325cdd308eSdjm	jae	.Lcommon_seh_tail
29336249468aSthib
29345c104365Sjsing	lea	16(%rax),%rsi		# %xmm save area
29356249468aSthib	lea	512($context),%rdi	# &context.Xmm6
29366249468aSthib	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
29376249468aSthib	.long	0xa548f3fc		# cld; rep movsq
29385c104365Sjsing
# Shared with the ctr32 and xts handlers: undo a frame addressed via rbp.
29395c104365Sjsing.Lcommon_rbp_tail:
29405c104365Sjsing	mov	160($context),%rax	# pull context->Rbp
29415c104365Sjsing	mov	(%rax),%rbp		# restore saved %rbp
29425c104365Sjsing	lea	8(%rax),%rax		# adjust stack pointer
29435c104365Sjsing	mov	%rbp,160($context)	# restore context->Rbp
29445cdd308eSdjm	jmp	.Lcommon_seh_tail
29456249468aSthib
29465cdd308eSdjm.Lrestore_cbc_rax:
29476249468aSthib	mov	120($context),%rax
29485cdd308eSdjm
# Shared by every handler.  NOTE(review): the two loads below presumably
# fetch the caller-saved rdi and rsi from slots written by the function
# prologue -- confirm against the prologue emitted for Win64.
29495cdd308eSdjm.Lcommon_seh_tail:
29506249468aSthib	mov	8(%rax),%rdi
29516249468aSthib	mov	16(%rax),%rsi
29526249468aSthib	mov	%rax,152($context)	# restore context->Rsp
29536249468aSthib	mov	%rsi,168($context)	# restore context->Rsi
29546249468aSthib	mov	%rdi,176($context)	# restore context->Rdi
29556249468aSthib
# The count is in quadwords: 154 * 8 bytes are copied.
29566249468aSthib	mov	40($disp),%rdi		# disp->ContextRecord
29576249468aSthib	mov	$context,%rsi		# context
29586249468aSthib	mov	\$154,%ecx		# sizeof(CONTEXT)
29596249468aSthib	.long	0xa548f3fc		# cld; rep movsq
29606249468aSthib
# Arguments five to eight go in the 64 bytes reserved on entry.
29616249468aSthib	mov	$disp,%rsi
29626249468aSthib	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
29636249468aSthib	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
29646249468aSthib	mov	0(%rsi),%r8		# arg3, disp->ControlPc
29656249468aSthib	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
29666249468aSthib	mov	40(%rsi),%r10		# disp->ContextRecord
29676249468aSthib	lea	56(%rsi),%r11		# &disp->HandlerData
29686249468aSthib	lea	24(%rsi),%r12		# &disp->EstablisherFrame
29696249468aSthib	mov	%r10,32(%rsp)		# arg5
29706249468aSthib	mov	%r11,40(%rsp)		# arg6
29716249468aSthib	mov	%r12,48(%rsp)		# arg7
29726249468aSthib	mov	%rcx,56(%rsp)		# arg8, (NULL)
29736249468aSthib	call	*__imp_RtlVirtualUnwind(%rip)
29746249468aSthib
# Release the scratch area and restore everything pushed on entry.
29756249468aSthib	mov	\$1,%eax		# ExceptionContinueSearch
29766249468aSthib	add	\$64,%rsp
29776249468aSthib	popfq
29786249468aSthib	pop	%r15
29796249468aSthib	pop	%r14
29806249468aSthib	pop	%r13
29816249468aSthib	pop	%r12
29826249468aSthib	pop	%rbp
29836249468aSthib	pop	%rbx
29846249468aSthib	pop	%rdi
29856249468aSthib	pop	%rsi
29866249468aSthib	ret
29876249468aSthib.size	cbc_se_handler,.-cbc_se_handler
29886249468aSthib
# Function table.  Each .pdata entry is three RVAs: start of the function,
# end of the function, and the .xdata record describing how to unwind it.
29896249468aSthib.section	.pdata
29906249468aSthib.align	4
29915cdd308eSdjm___
# Entries for the routines that exist only in the "aesni" flavour.
29925cdd308eSdjm$code.=<<___ if ($PREFIX eq "aesni");
29935cdd308eSdjm	.rva	.LSEH_begin_aesni_ecb_encrypt
29945cdd308eSdjm	.rva	.LSEH_end_aesni_ecb_encrypt
29956249468aSthib	.rva	.LSEH_info_ecb
29966249468aSthib
29975cdd308eSdjm	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
29985cdd308eSdjm	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
29995cdd308eSdjm	.rva	.LSEH_info_ccm64_enc
30005cdd308eSdjm
30015cdd308eSdjm	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
30025cdd308eSdjm	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
30035cdd308eSdjm	.rva	.LSEH_info_ccm64_dec
30045cdd308eSdjm
30055cdd308eSdjm	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
30065cdd308eSdjm	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
30075cdd308eSdjm	.rva	.LSEH_info_ctr32
30085cdd308eSdjm
30095cdd308eSdjm	.rva	.LSEH_begin_aesni_xts_encrypt
30105cdd308eSdjm	.rva	.LSEH_end_aesni_xts_encrypt
30115cdd308eSdjm	.rva	.LSEH_info_xts_enc
30125cdd308eSdjm
30135cdd308eSdjm	.rva	.LSEH_begin_aesni_xts_decrypt
30145cdd308eSdjm	.rva	.LSEH_end_aesni_xts_decrypt
30155cdd308eSdjm	.rva	.LSEH_info_xts_dec
30165cdd308eSdjm___
# Entries present for every $PREFIX.  Both key-setup routines use the
# function symbol itself as the start address and share one unwind record.
30175cdd308eSdjm$code.=<<___;
30186249468aSthib	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
30196249468aSthib	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
30206249468aSthib	.rva	.LSEH_info_cbc
30216249468aSthib
30226249468aSthib	.rva	${PREFIX}_set_decrypt_key
30236249468aSthib	.rva	.LSEH_end_set_decrypt_key
30246249468aSthib	.rva	.LSEH_info_key
30256249468aSthib
30266249468aSthib	.rva	${PREFIX}_set_encrypt_key
30276249468aSthib	.rva	.LSEH_end_set_encrypt_key
30286249468aSthib	.rva	.LSEH_info_key
30296249468aSthib.section	.xdata
30306249468aSthib.align	8
30315cdd308eSdjm___
# Unwind records.  A leading byte of 9 is UNWIND_INFO version 1 with the
# exception-handler flag set; the remaining three header bytes (prologue
# size, unwind-code count, frame register) are zero because the named
# handler does all the work.  Records that list two extra RVAs supply the
# HandlerData[] pair read by ccm64_se_handler and xts_se_handler.
30325cdd308eSdjm$code.=<<___ if ($PREFIX eq "aesni");
30336249468aSthib.LSEH_info_ecb:
30346249468aSthib	.byte	9,0,0,0
30356249468aSthib	.rva	ecb_se_handler
30365cdd308eSdjm.LSEH_info_ccm64_enc:
30375cdd308eSdjm	.byte	9,0,0,0
30385cdd308eSdjm	.rva	ccm64_se_handler
30395cdd308eSdjm	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
30405cdd308eSdjm.LSEH_info_ccm64_dec:
30415cdd308eSdjm	.byte	9,0,0,0
30425cdd308eSdjm	.rva	ccm64_se_handler
30435cdd308eSdjm	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
30445cdd308eSdjm.LSEH_info_ctr32:
30455cdd308eSdjm	.byte	9,0,0,0
30465cdd308eSdjm	.rva	ctr32_se_handler
30475cdd308eSdjm.LSEH_info_xts_enc:
30485cdd308eSdjm	.byte	9,0,0,0
30495cdd308eSdjm	.rva	xts_se_handler
30505cdd308eSdjm	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
30515cdd308eSdjm.LSEH_info_xts_dec:
30525cdd308eSdjm	.byte	9,0,0,0
30535cdd308eSdjm	.rva	xts_se_handler
30545cdd308eSdjm	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
30555cdd308eSdjm___
# .LSEH_info_key needs no handler: version 1, no flags, a 4-byte prologue
# and a single unwind code describing the 8-byte stack allocation.
#
# NOTE(review): the prologue size of 4 and the unwind-code offset of 4
# assume the stack adjustment is the first instruction of the function.
# Both key-setup entry points now start with _CET_ENDBR; confirm that the
# macro emits nothing in Win64 builds, otherwise these offsets are off.
30565cdd308eSdjm$code.=<<___;
30576249468aSthib.LSEH_info_cbc:
30586249468aSthib	.byte	9,0,0,0
30596249468aSthib	.rva	cbc_se_handler
30606249468aSthib.LSEH_info_key:
30616249468aSthib	.byte	0x01,0x04,0x01,0x00
30625cdd308eSdjm	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
30636249468aSthib___
30646249468aSthib}
30656249468aSthib
# rex(\@opcode, $dst, $src)
#
# Appends an x86-64 REX prefix byte to the opcode byte list when one is
# needed.  A destination register number of 8 or more sets bit 0x04 (REX.R);
# a source register number of 8 or more sets bit 0x01 (REX.B).  When neither
# applies the list is left untouched.
sub rex {
    my ($opcode_ref, $dst, $src) = @_;

    my $ext_bits = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    push @{$opcode_ref}, 0x40 | $ext_bits if $ext_bits;
}
30756249468aSthib
# Post-process and emit the generated assembly.
#
# Any span the generator wrapped in backticks is a Perl expression meant to
# be folded at generation time; each one is replaced by the result of
# evaluating it.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Output is buffered, so a failed write (full disk, broken pipe) is only
# reported when the handle is closed.  If STDOUT was redirected into a
# translator process earlier in the file (not visible here), close also
# returns false when that process exits non-zero, with $! left at 0.
# Die in either case so the build never continues with a truncated file.
close STDOUT
    or die "error closing STDOUT: " . ($! ? $! : "exit status $?");
3081