#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# What started as a transliteration to "perlasm" has since undergone
# the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop, resulting in a >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was made possible by a mixcolumns() modification that
#   allows its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2    	9.30		8.69		+7%
# Nehalem(**) 	7.63		6.98		+9%
# Atom	    	17.1		17.4		-2%(***)
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter value calculation
#	and xor-ing of input, as in Emilia's CTR implementation, is
#	performed. However, the CTR calculations account for no more
#	than 1% of total time, so the comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# (***)	The slowdown on Atom is rather strange per se, because the
#	original implementation has a number of 9+-byte instructions,
#	which are bad for the Atom front-end, and which I eliminated
#	completely. In an attempt to address the deterioration, sbox()
#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
#	xorps instead of pxor, etc.). While that resulted in a nominal
#	4% improvement on Atom, it hurt Westmere by more than a 2x
#	factor.
#
# As for the key schedule conversion subroutine: the OpenSSL interface
# relies on per-invocation on-the-fly conversion. This naturally
# has an impact on performance, especially for short inputs. Conversion
# time in CPU cycles and its ratio to CPU cycles spent in the 8x block
# function is:
#
# 		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.19
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than [the "hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2	9.83
# Nehalem	7.74
# Atom		19.0
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#						<appro@openssl.org>
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}
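# The AES S-box is S(x) = affine(x^{-1}) over GF(2^8)/(x^8+x^4+x^3+x+1).
# Following Käsper and Schwabe, the basis changes above move the state
# into a tower-field representation GF(((2^2)^2)^2), in which inversion
# is cheap to express as boolean gates, and back; OutBasisChange also
# folds in the affine transformation, while the 0x63 constant is
# absorbed into the converted key schedule (see _bsaes_key_convert).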

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___;
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	 pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	 pxor	@b[2], @b[6]
	 pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
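# Per 128-bit lane the sequence above evaluates
#	x0' = ((x0^x1) & y1) ^ (x1 & y0)
#	x1' = ((y0^y1) & x0) ^ (x1 & y0)
# i.e. one GF(2^2) multiplication in 3 ANDs, 4 XORs and 1 move.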

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	 movdqa	$y2, $t1
	pxor	$y1, $t0
	 pxor	$y3, $t1
	pand	$x0, $t0
	 pand	$x2, $t1
	pxor	$x1, $x0
	 pxor	$x3, $x2
	pand	$y0, $x1
	 pand	$y2, $x3
	pand	$y1, $x0
	 pand	$y3, $x2
	pxor	$x0, $x1
	 pxor	$x3, $x2
	pxor	$t0, $x0
	 pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
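# Mul_GF16_2 computes two GF(2^4) multiplications (x0..x3 and x4..x7,
# both by y0..y3), each Karatsuba-style as three GF(2^2) multiplications
# on the (high, low, high^low) pairs; the Mul_GF4_N flavour folds the
# scaling by the tower-field constant N into its output recombination
# at no extra cost.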
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	 movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	 movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	 movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
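# For reference: viewing GF(2^8) as GF(2^4)[Y]/(Y^2+Y+lambda) and
# writing a = ah*Y + al, the inverse is
#	a^{-1} = (ah*d)*Y + (ah+al)*d,	d = (lambda*ah^2 + ah*al + al^2)^{-1}
# so a full GF(2^8) inversion reduces to GF(2^4) multiplications and
# squarings plus one GF(2^4) inversion, which is what the gate network
# above evaluates on all 128 bit positions at once.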

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}

sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	 pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	 pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	 pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	 pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	 pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	 pxor	@t[6], @x[6]
	 pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	 pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64
	pxor	@x[1], @t[2]
	 pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	 pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	 pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	 pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	 pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	 pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	 pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	 pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	 movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	 movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	 movdqa	@t[6], @x[2]
	 movdqa	@t[1], @x[7]
	 movdqa	@x[6], @x[4]
	 movdqa	@t[3], @x[6]
___
}

sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing a pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
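#
# A spot check of the factorization in GF(2^8): row 1 times column 1
# gives 02*05 ^ 03*00 ^ 01*04 ^ 01*00 = 0a ^ 04 = 0e, and row 1 times
# column 2 gives 02*00 ^ 03*05 ^ 01*00 ^ 01*04 = 0f ^ 04 = 0b.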

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	 pxor	@t[6], @x[0]
	 pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	 pxor	@t[0], @x[2]
	 pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	 pxor	@t[7], @x[1]
	 pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	 pxor	@t[7], @x[2]
	 pxor	@t[6], @x[3]
	 pxor	@t[6], @x[4]
	 pxor	@t[3], @x[5]
	 pxor	@t[4], @x[6]
	 pxor	@t[7], @x[4]
	 pxor	@t[7], @x[5]
	 pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___;
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
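# swapmove is the classic delta-swap: m = ((b>>n) ^ a) & mask;
# a ^= m; b ^= m<<n. It exchanges the bits of $a selected by $mask with
# the bits of $b that sit n positions higher, independently within each
# 64-bit lane (psrlq/psllq shift the two qwords separately).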
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	 movdqa	$b1,$t1
	 psrlq	\$$n,$b1
	pxor	$a0,$b0
	 pxor	$a1,$b1
	pand	$mask,$b0
	 pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	 pxor	$b1,$a1
	 psllq	\$$n,$b1
	pxor	$t0,$b0
	 pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
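# bitslice transposes eight 128-bit registers, each holding one block of
# state, into bit-plane order: after swapmove rounds at distances 1, 2
# and 4 (with the .LBS0/.LBS1/.LBS2 masks, the usual 0x55../0x33../0x0f..
# patterns), register i holds bit i of every state byte of all 8 blocks.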

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	_CET_ENDBR
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	_CET_ENDBR
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}
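# Since all eight blocks are encrypted under the same key, half of the
# swapmove work degenerates: planes that would be exchanged between
# identical registers are simply duplicated with movdqa instead (the
# commented-out swapmove calls show what a full transposition would do).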

$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	_CET_ENDBR
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6,	%xmm8
	pand	%xmm6,	%xmm9
	movdqa	%xmm2,	%xmm10
	pcmpeqb	%xmm0,	%xmm8
	psllq	\$4,	%xmm0		# 0x10...
	movdqa	%xmm3,	%xmm11
	pcmpeqb	%xmm1,	%xmm9
	psllq	\$4,	%xmm1		# 0x20...

	pand	%xmm6,	%xmm10
	pand	%xmm6,	%xmm11
	movdqa	%xmm0,	%xmm12
	pcmpeqb	%xmm2,	%xmm10
	psllq	\$4,	%xmm2		# 0x40...
	movdqa	%xmm1,	%xmm13
	pcmpeqb	%xmm3,	%xmm11
	psllq	\$4,	%xmm3		# 0x80...

	movdqa	%xmm2,	%xmm14
	movdqa	%xmm3,	%xmm15
	 pxor	%xmm5,	%xmm8		# "pnot"
	 pxor	%xmm5,	%xmm9

	pand	%xmm6,	%xmm12
	pand	%xmm6,	%xmm13
	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0,	%xmm12
	psrlq	\$4,	%xmm0		# 0x01...
	 movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1,	%xmm13
	psrlq	\$4,	%xmm1		# 0x02...
	 lea	0x10($inp), $inp

	pand	%xmm6,	%xmm14
	pand	%xmm6,	%xmm15
	 movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2,	%xmm14
	psrlq	\$4,	%xmm2		# 0x04...
	 movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3,	%xmm15
	psrlq	\$4,	%xmm3		# 0x08...
	 movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}
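# _bsaes_key_convert expands each 16-byte round key into eight 16-byte
# bit-plane masks: pand isolates one bit per byte and pcmpeqb against
# the 0x01..0x80 masks turns it into an all-ones/all-zeros byte, so the
# bit-sliced schedule takes 128 bytes per round. The "pnot" of planes
# 0, 1, 5 and 6 accounts for the S-box constant 0x63 (binary 01100011).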

if (0 && !$win64) {	# the following four functions are an unsupported
			# interface used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	_CET_ENDBR
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
	_CET_ENDBR
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	_CET_ENDBR
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
	_CET_ENDBR
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	_CET_ENDBR
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	_CET_ENDBR
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
	_CET_ENDBR
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	_CET_ENDBR
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
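	# The counter was byte-swapped up front (.LSWPUP), so the eight
	# per-block counter values can be produced with plain paddd and
	# .LADD1..LADD7; the swap back to big-endian order is folded into
	# the .LSWPUPM0SR permutation of the borrowed _bsaes_encrypt8
	# prologue.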
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	 pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter (%rsp equals %rbp on this path)
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
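# The tweak update in the loop below multiplies the running tweak by x
# in GF(2^128)/(x^128+x^7+x^2+x+1): paddq doubles both 64-bit halves,
# pcmpgtd against zero broadcasts the two carried-out sign bits, and
# pshufd 0x13 plus pand .Lxts_magic route them so the low-half carry
# feeds bit 0 of the high half while the high-half carry folds back
# into the low byte as 0x87.
#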
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;

$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	_CET_ENDBR
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp
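	# %rax = rounds*128 - 96 bytes, i.e. 16 + (rounds-1)*128 + 16:
	# every inner round key occupies a full 128 bytes in bit-sliced
	# form, while the differently-treated first and last round keys
	# need only 16 bytes each (1184 bytes total for AES-128).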

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
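# Each iteration above doubles the tweak in GF(2^128): pcmpgtd against
# zero broadcasts the sign bit of every dword, pshufd 0x13 routes the
# two masks that matter (overall carry-out and the low-half carry) to
# the lanes .Lxts_magic selects, and pxor folds them back in after
# paddq has shifted both 64-bit halves left by one.  As illustrative C
# (not part of the generated code):
#
#	uint64_t lo = T[0], hi = T[1];		/* little-endian halves */
#	uint64_t carry = hi >> 63;		/* bit shifted out      */
#	hi = (hi << 1) | (lo >> 63);		/* 128-bit shift left   */
#	lo = (lo << 1) ^ (0x87 & -carry);	/* reduce mod x^128+    */
#	T[0] = lo; T[1] = hi;			/*   x^7+x^2+x+1        */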
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal
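	# Ciphertext stealing: on entry the output pointer sits just
	# past the last full ciphertext block and %ebx holds
	# tail = len % 16.  As illustrative C (not part of the
	# generated code):
	#
	#	for (i = 0; i < tail; i++) {
	#		unsigned char p = in[i];  /* leftover plaintext */
	#		out[i] = out[i - 16];     /* stolen bytes -> C_m */
	#		out[i - 16] = p;          /* splice into block   */
	#	}
	#
	# The spliced block at out[-16] is then re-encrypted below with
	# the next tweak, \@XMM[7].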

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	_CET_ENDBR
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)
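	# _bsaes_decrypt8 walks the schedule in the opposite order, so
	# the round-0 slot is patched in place and the last round key
	# (left in %xmm6 by the conversion) is stored at the far end.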

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len
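	# Branch-free length adjust (%rbx still holds the original
	# length): setnz turns the "partial final block" condition into
	# 0/1, shifting left by 4 scales that to 0/16, and the subtract
	# defers one full block so it can take part in the ciphertext
	# stealing at .Lxts_dec_done.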

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal
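	# Decrypt-side stealing: the last full ciphertext block was
	# decrypted above with the following tweak, \@XMM[7], while the
	# current tweak was parked in \@XMM[6].  As illustrative C (not
	# part of the generated code), with tail = len % 16:
	#
	#	for (i = 0; i < tail; i++) {
	#		out[16 + i] = out[i];   /* first bytes -> P_m  */
	#		out[i] = in[16 + i];    /* splice short C_m in */
	#	}
	#
	# The spliced block is then decrypted below with \@XMM[6] to
	# yield the second-to-last plaintext block.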

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.section .rodata
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
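	# .LADDn adds n to dword 3 only; the CTR loop keeps its counter
	# blocks in a form where that dword carries the 32-bit counter,
	# so a single paddd bumps all eight counters at once.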
.Lxts_magic:
	.long	0x87,0,1,0
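	# 0x87 encodes x^7+x^2+x+1, the low byte of the XTS reduction
	# polynomial x^128+x^7+x^2+x+1; the lone 1 in dword 2 re-injects
	# the low half's carry as bit 64 (see the pshufd with immediate
	# 0x13 in the tweak loops above).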
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.align	64
.size	_bsaes_const,.-_bsaes_const
.text
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
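# se_handler restores the non-volatile registers and %xmm6-15 from the
# frame so the Windows unwinder can step over these functions; each
# .xdata entry at the bottom of the file hands it a [body, epilogue)
# label pair as HandlerData[].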
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	_CET_ENDBR
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

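# Expand the backquoted constant arithmetic left behind in the emitted
# text (e.g. `128-32` becomes 96 and `0x10*$i`, after interpolation of
# $i, becomes the tweak slot offset) before printing the final code.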
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;