1#! /usr/bin/env perl
2# Copyright 2011-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10###################################################################
11### AES-128 [originally in CTR mode]				###
12### bitsliced implementation for Intel Core 2 processors	###
13### requires support of SSE extensions up to SSSE3		###
14### Author: Emilia Käsper and Peter Schwabe			###
15### Date: 2009-03-19						###
16### Public domain						###
17###								###
18### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
19### further information.					###
20###################################################################
21#
22# September 2011.
23#
24# Started as transliteration to "perlasm" the original code has
25# undergone following changes:
26#
27# - code was made position-independent;
28# - rounds were folded into a loop resulting in >5x size reduction
29#   from 12.5KB to 2.2KB;
30# - above was possible thanks to mixcolumns() modification that
31#   allowed to feed its output back to aesenc[last], this was
32#   achieved at cost of two additional inter-registers moves;
33# - some instruction reordering and interleaving;
34# - this module doesn't implement key setup subroutine, instead it
35#   relies on conversion of "conventional" key schedule as returned
36#   by AES_set_encrypt_key (see discussion below);
37# - first and last round keys are treated differently, which allowed
38#   to skip one shiftrows(), reduce bit-sliced key schedule and
39#   speed-up conversion by 22%;
40# - support for 192- and 256-bit keys was added;
41#
42# Resulting performance in CPU cycles spent to encrypt one byte out
43# of 4096-byte buffer with 128-bit key is:
44#
45#		Emilia's	this(*)		difference
46#
47# Core 2    	9.30		8.69		+7%
48# Nehalem(**) 	7.63		6.88		+11%
49# Atom	    	17.1		16.4		+4%
50# Silvermont	-		12.9
51# Goldmont	-		8.85
52#
53# (*)	Comparison is not completely fair, because "this" is ECB,
54#	i.e. no extra processing such as counter values calculation
55#	and xor-ing input as in Emilia's CTR implementation is
56#	performed. However, the CTR calculations stand for not more
57#	than 1% of total time, so comparison is *rather* fair.
58#
59# (**)	Results were collected on Westmere, which is considered to
60#	be equivalent to Nehalem for this code.
61#
62# As for key schedule conversion subroutine. Interface to OpenSSL
63# relies on per-invocation on-the-fly conversion. This naturally
64# has impact on performance, especially for short inputs. Conversion
65# time in CPU cycles and its ratio to CPU cycles spent in 8x block
66# function is:
67#
68# 		conversion	conversion/8x block
69# Core 2	240		0.22
70# Nehalem	180		0.20
71# Atom		430		0.20
72#
73# The ratio values mean that 128-byte blocks will be processed
74# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
75# etc. Then keep in mind that input sizes not divisible by 128 are
76# *effectively* slower, especially shortest ones, e.g. consecutive
77# 144-byte blocks are processed 44% slower than one would expect,
78# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
79# it's still faster than ["hyper-threading-safe" code path in]
80# aes-x86_64.pl on all lengths above 64 bytes...
81#
82# October 2011.
83#
84# Add decryption procedure. Performance in CPU cycles spent to decrypt
85# one byte out of 4096-byte buffer with 128-bit key is:
86#
87# Core 2	9.98
88# Nehalem	7.80
89# Atom		17.9
90# Silvermont	14.0
91# Goldmont	10.2
92#
93# November 2011.
94#
95# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
96# suboptimal, but XTS is meant to be used with larger blocks...
97#
98#						<appro@openssl.org>
99
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Windows ABI is selected by flavour (nasm/masm/mingw64) or by .asm output.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's own directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything printed to STDOUT through the translator; from here on
# the generated perlasm text becomes the assembler input.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# SysV calling-convention registers for the public entry points.
# NOTE(review): five names but only four values -- $ivp is left undef
# here; presumably it is assigned where IV-using modes are generated
# (outside this chunk) -- confirm against the rest of the file.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
119
120{
121my ($key,$rounds,$const)=("%rax","%r10d","%r11");
122
sub Sbox {
# Bit-sliced AES S-box: change basis into the tower field, invert in
# GF(2^8), change basis back out.  Works on eight state registers with
# eight scratch registers.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @bits    = @_[0..7];		# bit-sliced state registers
my @tmp     = @_[8..11];	# temporaries for Inv_GF256
my @scratch = @_[12..15];	# additional scratch for Inv_GF256
	&InBasisChange	(@bits);
	&Inv_GF256	(@bits[6,5,0,3,7,1,4,2],@tmp,@scratch);
	&OutBasisChange	(@bits[7,1,4,2,6,5,0,3]);
}
133
sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
# Linear change of basis (13 XORs, no temporaries) from the AES
# polynomial representation into the basis used by Inv_GF256 below.
# The XOR network is order-dependent; do not reorder lines.
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor 	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}
156
sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
# Linear change of basis (11 XORs) from the Inv_GF256 output basis
# back to the AES polynomial representation.  Order-dependent.
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}
177
sub InvSbox {
# Bit-sliced inverse AES S-box: inverse output-basis change, the same
# GF(2^8) inversion as the forward S-box, then the inverse input-basis
# change.  Register permutations differ from Sbox accordingly.
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @bits    = @_[0..7];		# bit-sliced state registers
my @tmp     = @_[8..11];	# temporaries for Inv_GF256
my @scratch = @_[12..15];	# additional scratch for Inv_GF256
	&InvInBasisChange	(@bits);
	&Inv_GF256		(@bits[5,1,2,6,3,7,0,4],@tmp,@scratch);
	&InvOutBasisChange	(@bits[3,7,0,4,5,1,2,6]);
}
188
sub InvInBasisChange {		# OutBasisChange in reverse
# Undo OutBasisChange: the same XOR network applied in reverse order,
# with registers pre-permuted via the slice below.  (Note: the heredoc
# assignment has no trailing ';' -- legal, as it is the sub's last
# statement before the closing brace.)
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}
207
sub InvOutBasisChange {		# InBasisChange in reverse
# Undo InBasisChange: reversed XOR network with registers pre-permuted
# via the slice below.  Lines indented with an extra space are
# interleaved for scheduling; the relative order still matters.
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	 pxor 	@b[0], @b[5]
	pxor	@b[7], @b[3]
	 pxor	@b[2], @b[6]
	 pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}
228
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
# GF(2^2) multiply of bit-sliced pairs: (a1,a0) *= (b1,b0), using one
# temporary register; eight instructions total.  Result replaces a0/a1.
my ($a0,$a1,$b0,$b1,$tmp)=@_;
$code.=<<___;
	movdqa	$b0, $tmp
	pxor 	$b1, $tmp
	pand	$a0, $tmp
	pxor	$a1, $a0
	pand	$b0, $a1
	pand	$b1, $a0
	pxor	$a1, $a0
	pxor	$tmp, $a1
___
}
245
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
# Same GF(2^2) multiply as Mul_GF4 but with the two final XOR
# destinations swapped, folding in the scaling by N.  Kept for
# reference; the interleaved variant below is what the code uses.
my ($a0,$a1,$b0,$b1,$tmp)=@_;
$code.=<<___;
	movdqa	$b0, $tmp
	pxor	$b1, $tmp
	pand	$a0, $tmp
	pxor	$a1, $a0
	pand	$b0, $a1
	pand	$b1, $a0
	pxor	$a0, $a1
	pxor	$tmp, $a0
___
}
260
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
# First operand set ($x0/$x1 by $y0/$y1, temp $t0) is the scaled
# multiply, second set (extra leading space: $x2/$x3 by $y2/$y3,
# temp $t1) is the plain multiply.  Interleaving hides latency;
# instruction order is deliberate -- do not reorder.
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	 movdqa	$y2, $t1
	pxor	$y1, $t0
	 pxor 	$y3, $t1
	pand	$x0, $t0
	 pand	$x2, $t1
	pxor	$x1, $x0
	 pxor	$x3, $x2
	pand	$y0, $x1
	 pand	$y2, $x3
	pand	$y1, $x0
	 pand	$y3, $x2
	pxor	$x0, $x1
	 pxor	$x3, $x2
	pxor	$t0, $x0
	 pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
# Two GF(2^4) multiplies of the bit-sliced state halves @x[0..3] and
# @x[4..7] by the common operand in @y[0..3], using @t as scratch.
# Built from the GF(2^2) primitives above (Karatsuba-style split).
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	# (called without leading '&' -- equivalent, sub is defined above)
	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
# GF(2^8) inversion of the bit-sliced state via tower-field
# decomposition.  The instruction sequence is taken from a
# hardware-optimised circuit ("direct optimizations from hardware"
# below); it is tightly scheduled and register-pressure bound --
# do not reorder or restructure.
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	 movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	 movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	 movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	# lift the GF(2^4) inverse back up with two GF(16) multiplies
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
433
434# AES linear components
435
sub ShiftRows {
# Despite the name, this emits AddRoundKey *and* ShiftRows: each state
# register is XORed with its slice of the bit-sliced round key at
# ($key) and then byte-permuted with the $mask constant (pshufb).
# Side effect: advances $key by 0x80 (one bit-sliced round key).
my @x=@_[0..7];
my $mask=pop;	# shuffle-mask register (9th argument)
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}
459
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
# Bit-sliced MixColumns built from 32-bit rotations (pshufd 0x93 is a
# <<<32 dword rotate, 0x4E is <<<64).  When the optional $inv flag is
# set, the final register arrangement differs (2<->3, 4<->6 flipped)
# to suit InvMixColumns, which calls this as its second half.
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	 pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	 pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	 pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	 pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	 pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	 pxor	@t[6], @x[6]
	 pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	 pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	 pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	 pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	 pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	 pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	 pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	 pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	 pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	 movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	 movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	 movdqa	@t[6], @x[2]
	 movdqa	@t[1], @x[7]
	 movdqa	@x[6], @x[4]
	 movdqa	@t[3], @x[6]
___
}
527
sub InvMixColumns_orig {
# Original bit-sliced InvMixColumns: computes the four constant
# multiples (0x0e, 0x0b, 0x0d, 0x09) of the state directly.  Superseded
# by the factored InvMixColumns below but kept for reference.  The
# bracketed numbers in the comments track which output each register
# holds; @t registers are deliberately clobbered and restored --
# do not reorder.
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}
681
sub InvMixColumns {
# Factored InvMixColumns: apply the sparse 05/00/04/00 circulant first,
# then reuse the forward MixColumns (with $inv flag set) for the
# 02/03/01/01 part -- per the matrix identity below.
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	 pxor	@t[6], @x[0]
	 pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	 pxor	@t[0], @x[2]
	 pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	 pxor	@t[7], @x[1]
	 pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	 pxor	@t[7], @x[2]
	 pxor	@t[6], @x[3]
	 pxor	@t[6], @x[4]
	 pxor	@t[3], @x[5]
	 pxor	@t[4], @x[6]
	 pxor	@t[7], @x[4]
	 pxor	@t[7], @x[5]
	 pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
729
sub aesenc {				# not used
# One full bit-sliced AES round for reference: AddRoundKey+ShiftRows,
# S-box, MixColumns.  The main loop in _bsaes_encrypt8 emits the same
# sequence inline instead of calling this.
my @state=@_[0..7];
my @tmp=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@tmp[0]	# .LSR
___
	&ShiftRows	(@state,@tmp[0]);
	&Sbox		(@state,@tmp);
	&MixColumns	(@state[0,1,4,6,3,7,2,5],@tmp);
}
740
sub aesenclast {			# not used
# Final bit-sliced AES round for reference: ShiftRows (with the .LSRM0
# combined mask), S-box, then the last AddRoundKey in S-box output
# order.  (The second heredoc has no trailing ';' -- legal, as it is
# the sub's last statement.)
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}
760
sub swapmove {
# Bit-matrix transposition step: exchange the bit groups selected by
# $mask between $lo and $hi across a shift distance of $n bits,
# using $tmp as scratch.  (Locals renamed away from $a/$b, which are
# Perl's special sort variables.)
my ($lo,$hi,$n,$mask,$tmp)=@_;
$code.=<<___;
	movdqa	$hi,$tmp
	psrlq	\$$n,$hi
	pxor  	$lo,$hi
	pand	$mask,$hi
	pxor	$hi,$lo
	psllq	\$$n,$hi
	pxor	$tmp,$hi
___
}
sub swapmove2x {
# Two independent swapmove operations interleaved for instruction-level
# parallelism (extra leading space marks the second operation's
# instructions).  Same semantics as two swapmove() calls.
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	 movdqa	$b1,$t1
	 psrlq	\$$n,$b1
	pxor  	$a0,$b0
	 pxor  	$a1,$b1
	pand	$mask,$b0
	 pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	 pxor	$b1,$a1
	 psllq	\$$n,$b1
	pxor	$t0,$b0
	 pxor	$t1,$b1
___
}
792
sub bitslice {
# Convert eight 128-bit blocks to/from bit-sliced representation: an
# 8x8 bit-matrix transposition done as three rounds of masked swaps
# (distances 1, 2, 4) with the .LBS0/.LBS1/.LBS2 masks.  The transform
# is an involution, so the same routine converts in both directions.
my @x=reverse(@_[0..7]);	# note: register order is reversed
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
811
# Emit the two core 8-block primitives.  Both expect: plaintext blocks
# in @XMM[0..7], bit-sliced key schedule in $key, round count in
# $rounds, and clobber @XMM[8..15].
$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	# convert the eight blocks to bit-sliced form
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	# round loop; ShiftRows also adds the round key (see ShiftRows)
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
# Normal rounds use the .LSR mask; when $rounds hits zero the .LSRM0
# mask (ShiftRows merged with the un-bitslice reordering) is loaded for
# the one final pass through the loop.
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
___
	# decryption mirrors encryption with the inverse transforms
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
948}
949{
950my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
951
sub bitslice_key {
# Bit-slice one round key.  Like bitslice(), but since all eight input
# registers hold the *same* key block, the distance-1 and distance-2
# swap stages collapse into plain register copies (the commented-out
# swapmove calls show what they replace).
my @x=reverse(@_[0..7]);	# note: register order is reversed
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}
975
# _bsaes_key_convert: convert a conventional AES key schedule ($rounds
# rounds at $inp) into bit-sliced form at $out.  Each round key is
# expanded to 8x16 bytes by testing each key bit against the 0x01..0x80
# masks (pcmpeqb), with selected slices complemented ("pnot") so that
# the cheaper ShiftRows/AddRoundKey sequence above works out.  On
# return %xmm7 holds the .L63 constant and %xmm6 the unconverted last
# round key; callers fix up and store the last round key themselves.
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
.cfi_startproc
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6,	%xmm8
	pand	%xmm6,	%xmm9
	movdqa	%xmm2,	%xmm10
	pcmpeqb	%xmm0,	%xmm8
	psllq	\$4,	%xmm0		# 0x10...
	movdqa	%xmm3,	%xmm11
	pcmpeqb	%xmm1,	%xmm9
	psllq	\$4,	%xmm1		# 0x20...

	pand	%xmm6,	%xmm10
	pand	%xmm6,	%xmm11
	movdqa	%xmm0,	%xmm12
	pcmpeqb	%xmm2,	%xmm10
	psllq	\$4,	%xmm2		# 0x40...
	movdqa	%xmm1,	%xmm13
	pcmpeqb	%xmm3,	%xmm11
	psllq	\$4,	%xmm3		# 0x80...

	movdqa	%xmm2,	%xmm14
	movdqa	%xmm3,	%xmm15
	 pxor	%xmm5,	%xmm8		# "pnot"
	 pxor	%xmm5,	%xmm9

	pand	%xmm6,	%xmm12
	pand	%xmm6,	%xmm13
	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0,	%xmm12
	psrlq	\$4,	%xmm0		# 0x01...
	 movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1,	%xmm13
	psrlq	\$4,	%xmm1		# 0x02...
	 lea	0x10($inp), $inp

	pand	%xmm6,	%xmm14
	pand	%xmm6,	%xmm15
	 movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2,	%xmm14
	psrlq	\$4,	%xmm2		# 0x04...
	 movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3,	%xmm15
	psrlq	\$4,	%xmm3		# 0x08...
	 movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.cfi_endproc
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
1062}
1063
if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
# NOTE: the literal 0 in the condition above permanently compiles this
# section out; it is retained only as a reference implementation of the
# raw key-convert/encrypt/decrypt entry points for benchmarking.
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
1161{
1162######################################################################
1163#
1164# OpenSSL interface
1165#
1166my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1167						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1168my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1169
1170if ($ecb) {
1171$code.=<<___;
1172.globl	bsaes_ecb_encrypt_blocks
1173.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1174.align	16
1175bsaes_ecb_encrypt_blocks:
1176.cfi_startproc
1177	mov	%rsp, %rax
1178.Lecb_enc_prologue:
1179	push	%rbp
1180.cfi_push	%rbp
1181	push	%rbx
1182.cfi_push	%rbx
1183	push	%r12
1184.cfi_push	%r12
1185	push	%r13
1186.cfi_push	%r13
1187	push	%r14
1188.cfi_push	%r14
1189	push	%r15
1190.cfi_push	%r15
1191	lea	-0x48(%rsp),%rsp
1192.cfi_adjust_cfa_offset	0x48
1193___
1194$code.=<<___ if ($win64);
1195	lea	-0xa0(%rsp), %rsp
1196	movaps	%xmm6, 0x40(%rsp)
1197	movaps	%xmm7, 0x50(%rsp)
1198	movaps	%xmm8, 0x60(%rsp)
1199	movaps	%xmm9, 0x70(%rsp)
1200	movaps	%xmm10, 0x80(%rsp)
1201	movaps	%xmm11, 0x90(%rsp)
1202	movaps	%xmm12, 0xa0(%rsp)
1203	movaps	%xmm13, 0xb0(%rsp)
1204	movaps	%xmm14, 0xc0(%rsp)
1205	movaps	%xmm15, 0xd0(%rsp)
1206.Lecb_enc_body:
1207___
1208$code.=<<___;
1209	mov	%rsp,%rbp		# backup %rsp
1210.cfi_def_cfa_register	%rbp
1211	mov	240($arg4),%eax		# rounds
1212	mov	$arg1,$inp		# backup arguments
1213	mov	$arg2,$out
1214	mov	$arg3,$len
1215	mov	$arg4,$key
1216	cmp	\$8,$arg3
1217	jb	.Lecb_enc_short
1218
1219	mov	%eax,%ebx		# backup rounds
1220	shl	\$7,%rax		# 128 bytes per inner round key
1221	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1222	sub	%rax,%rsp
1223	mov	%rsp,%rax		# pass key schedule
1224	mov	$key,%rcx		# pass key
1225	mov	%ebx,%r10d		# pass rounds
1226	call	_bsaes_key_convert
1227	pxor	%xmm6,%xmm7		# fix up last round key
1228	movdqa	%xmm7,(%rax)		# save last round key
1229
1230	sub	\$8,$len
1231.Lecb_enc_loop:
1232	movdqu	0x00($inp), @XMM[0]	# load input
1233	movdqu	0x10($inp), @XMM[1]
1234	movdqu	0x20($inp), @XMM[2]
1235	movdqu	0x30($inp), @XMM[3]
1236	movdqu	0x40($inp), @XMM[4]
1237	movdqu	0x50($inp), @XMM[5]
1238	mov	%rsp, %rax		# pass key schedule
1239	movdqu	0x60($inp), @XMM[6]
1240	mov	%ebx,%r10d		# pass rounds
1241	movdqu	0x70($inp), @XMM[7]
1242	lea	0x80($inp), $inp
1243
1244	call	_bsaes_encrypt8
1245
1246	movdqu	@XMM[0], 0x00($out)	# write output
1247	movdqu	@XMM[1], 0x10($out)
1248	movdqu	@XMM[4], 0x20($out)
1249	movdqu	@XMM[6], 0x30($out)
1250	movdqu	@XMM[3], 0x40($out)
1251	movdqu	@XMM[7], 0x50($out)
1252	movdqu	@XMM[2], 0x60($out)
1253	movdqu	@XMM[5], 0x70($out)
1254	lea	0x80($out), $out
1255	sub	\$8,$len
1256	jnc	.Lecb_enc_loop
1257
1258	add	\$8,$len
1259	jz	.Lecb_enc_done
1260
1261	movdqu	0x00($inp), @XMM[0]	# load input
1262	mov	%rsp, %rax		# pass key schedule
1263	mov	%ebx,%r10d		# pass rounds
1264	cmp	\$2,$len
1265	jb	.Lecb_enc_one
1266	movdqu	0x10($inp), @XMM[1]
1267	je	.Lecb_enc_two
1268	movdqu	0x20($inp), @XMM[2]
1269	cmp	\$4,$len
1270	jb	.Lecb_enc_three
1271	movdqu	0x30($inp), @XMM[3]
1272	je	.Lecb_enc_four
1273	movdqu	0x40($inp), @XMM[4]
1274	cmp	\$6,$len
1275	jb	.Lecb_enc_five
1276	movdqu	0x50($inp), @XMM[5]
1277	je	.Lecb_enc_six
1278	movdqu	0x60($inp), @XMM[6]
1279	call	_bsaes_encrypt8
1280	movdqu	@XMM[0], 0x00($out)	# write output
1281	movdqu	@XMM[1], 0x10($out)
1282	movdqu	@XMM[4], 0x20($out)
1283	movdqu	@XMM[6], 0x30($out)
1284	movdqu	@XMM[3], 0x40($out)
1285	movdqu	@XMM[7], 0x50($out)
1286	movdqu	@XMM[2], 0x60($out)
1287	jmp	.Lecb_enc_done
1288.align	16
1289.Lecb_enc_six:
1290	call	_bsaes_encrypt8
1291	movdqu	@XMM[0], 0x00($out)	# write output
1292	movdqu	@XMM[1], 0x10($out)
1293	movdqu	@XMM[4], 0x20($out)
1294	movdqu	@XMM[6], 0x30($out)
1295	movdqu	@XMM[3], 0x40($out)
1296	movdqu	@XMM[7], 0x50($out)
1297	jmp	.Lecb_enc_done
1298.align	16
1299.Lecb_enc_five:
1300	call	_bsaes_encrypt8
1301	movdqu	@XMM[0], 0x00($out)	# write output
1302	movdqu	@XMM[1], 0x10($out)
1303	movdqu	@XMM[4], 0x20($out)
1304	movdqu	@XMM[6], 0x30($out)
1305	movdqu	@XMM[3], 0x40($out)
1306	jmp	.Lecb_enc_done
1307.align	16
1308.Lecb_enc_four:
1309	call	_bsaes_encrypt8
1310	movdqu	@XMM[0], 0x00($out)	# write output
1311	movdqu	@XMM[1], 0x10($out)
1312	movdqu	@XMM[4], 0x20($out)
1313	movdqu	@XMM[6], 0x30($out)
1314	jmp	.Lecb_enc_done
1315.align	16
1316.Lecb_enc_three:
1317	call	_bsaes_encrypt8
1318	movdqu	@XMM[0], 0x00($out)	# write output
1319	movdqu	@XMM[1], 0x10($out)
1320	movdqu	@XMM[4], 0x20($out)
1321	jmp	.Lecb_enc_done
1322.align	16
1323.Lecb_enc_two:
1324	call	_bsaes_encrypt8
1325	movdqu	@XMM[0], 0x00($out)	# write output
1326	movdqu	@XMM[1], 0x10($out)
1327	jmp	.Lecb_enc_done
1328.align	16
1329.Lecb_enc_one:
1330	call	_bsaes_encrypt8
1331	movdqu	@XMM[0], 0x00($out)	# write output
1332	jmp	.Lecb_enc_done
1333.align	16
1334.Lecb_enc_short:
1335	lea	($inp), $arg1
1336	lea	($out), $arg2
1337	lea	($key), $arg3
1338	call	asm_AES_encrypt
1339	lea	16($inp), $inp
1340	lea	16($out), $out
1341	dec	$len
1342	jnz	.Lecb_enc_short
1343
.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_enc_bzero		# loop while %rax < %rbp, i.e. until the
					# whole schedule below %rbp is cleared;
					# `jb` here would stop after one iteration
					# and leave key material in memory

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
1356___
1357$code.=<<___ if ($win64);
1358	movaps	0x40(%rbp), %xmm6
1359	movaps	0x50(%rbp), %xmm7
1360	movaps	0x60(%rbp), %xmm8
1361	movaps	0x70(%rbp), %xmm9
1362	movaps	0x80(%rbp), %xmm10
1363	movaps	0x90(%rbp), %xmm11
1364	movaps	0xa0(%rbp), %xmm12
1365	movaps	0xb0(%rbp), %xmm13
1366	movaps	0xc0(%rbp), %xmm14
1367	movaps	0xd0(%rbp), %xmm15
1368	lea	0xa0(%rax), %rax
1369.Lecb_enc_tail:
1370___
1371$code.=<<___;
1372	mov	-48(%rax), %r15
1373.cfi_restore	%r15
1374	mov	-40(%rax), %r14
1375.cfi_restore	%r14
1376	mov	-32(%rax), %r13
1377.cfi_restore	%r13
1378	mov	-24(%rax), %r12
1379.cfi_restore	%r12
1380	mov	-16(%rax), %rbx
1381.cfi_restore	%rbx
1382	mov	-8(%rax), %rbp
1383.cfi_restore	%rbp
1384	lea	(%rax), %rsp		# restore %rsp
1385.cfi_def_cfa_register	%rsp
1386.Lecb_enc_epilogue:
1387	ret
1388.cfi_endproc
1389.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1390
1391.globl	bsaes_ecb_decrypt_blocks
1392.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1393.align	16
1394bsaes_ecb_decrypt_blocks:
1395.cfi_startproc
1396	mov	%rsp, %rax
1397.Lecb_dec_prologue:
1398	push	%rbp
1399.cfi_push	%rbp
1400	push	%rbx
1401.cfi_push	%rbx
1402	push	%r12
1403.cfi_push	%r12
1404	push	%r13
1405.cfi_push	%r13
1406	push	%r14
1407.cfi_push	%r14
1408	push	%r15
1409.cfi_push	%r15
1410	lea	-0x48(%rsp),%rsp
1411.cfi_adjust_cfa_offset	0x48
1412___
1413$code.=<<___ if ($win64);
1414	lea	-0xa0(%rsp), %rsp
1415	movaps	%xmm6, 0x40(%rsp)
1416	movaps	%xmm7, 0x50(%rsp)
1417	movaps	%xmm8, 0x60(%rsp)
1418	movaps	%xmm9, 0x70(%rsp)
1419	movaps	%xmm10, 0x80(%rsp)
1420	movaps	%xmm11, 0x90(%rsp)
1421	movaps	%xmm12, 0xa0(%rsp)
1422	movaps	%xmm13, 0xb0(%rsp)
1423	movaps	%xmm14, 0xc0(%rsp)
1424	movaps	%xmm15, 0xd0(%rsp)
1425.Lecb_dec_body:
1426___
1427$code.=<<___;
1428	mov	%rsp,%rbp		# backup %rsp
1429.cfi_def_cfa_register	%rbp
1430	mov	240($arg4),%eax		# rounds
1431	mov	$arg1,$inp		# backup arguments
1432	mov	$arg2,$out
1433	mov	$arg3,$len
1434	mov	$arg4,$key
1435	cmp	\$8,$arg3
1436	jb	.Lecb_dec_short
1437
1438	mov	%eax,%ebx		# backup rounds
1439	shl	\$7,%rax		# 128 bytes per inner round key
1440	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1441	sub	%rax,%rsp
1442	mov	%rsp,%rax		# pass key schedule
1443	mov	$key,%rcx		# pass key
1444	mov	%ebx,%r10d		# pass rounds
1445	call	_bsaes_key_convert
1446	pxor	(%rsp),%xmm7		# fix up 0 round key
1447	movdqa	%xmm6,(%rax)		# save last round key
1448	movdqa	%xmm7,(%rsp)
1449
1450	sub	\$8,$len
1451.Lecb_dec_loop:
1452	movdqu	0x00($inp), @XMM[0]	# load input
1453	movdqu	0x10($inp), @XMM[1]
1454	movdqu	0x20($inp), @XMM[2]
1455	movdqu	0x30($inp), @XMM[3]
1456	movdqu	0x40($inp), @XMM[4]
1457	movdqu	0x50($inp), @XMM[5]
1458	mov	%rsp, %rax		# pass key schedule
1459	movdqu	0x60($inp), @XMM[6]
1460	mov	%ebx,%r10d		# pass rounds
1461	movdqu	0x70($inp), @XMM[7]
1462	lea	0x80($inp), $inp
1463
1464	call	_bsaes_decrypt8
1465
1466	movdqu	@XMM[0], 0x00($out)	# write output
1467	movdqu	@XMM[1], 0x10($out)
1468	movdqu	@XMM[6], 0x20($out)
1469	movdqu	@XMM[4], 0x30($out)
1470	movdqu	@XMM[2], 0x40($out)
1471	movdqu	@XMM[7], 0x50($out)
1472	movdqu	@XMM[3], 0x60($out)
1473	movdqu	@XMM[5], 0x70($out)
1474	lea	0x80($out), $out
1475	sub	\$8,$len
1476	jnc	.Lecb_dec_loop
1477
1478	add	\$8,$len
1479	jz	.Lecb_dec_done
1480
1481	movdqu	0x00($inp), @XMM[0]	# load input
1482	mov	%rsp, %rax		# pass key schedule
1483	mov	%ebx,%r10d		# pass rounds
1484	cmp	\$2,$len
1485	jb	.Lecb_dec_one
1486	movdqu	0x10($inp), @XMM[1]
1487	je	.Lecb_dec_two
1488	movdqu	0x20($inp), @XMM[2]
1489	cmp	\$4,$len
1490	jb	.Lecb_dec_three
1491	movdqu	0x30($inp), @XMM[3]
1492	je	.Lecb_dec_four
1493	movdqu	0x40($inp), @XMM[4]
1494	cmp	\$6,$len
1495	jb	.Lecb_dec_five
1496	movdqu	0x50($inp), @XMM[5]
1497	je	.Lecb_dec_six
1498	movdqu	0x60($inp), @XMM[6]
1499	call	_bsaes_decrypt8
1500	movdqu	@XMM[0], 0x00($out)	# write output
1501	movdqu	@XMM[1], 0x10($out)
1502	movdqu	@XMM[6], 0x20($out)
1503	movdqu	@XMM[4], 0x30($out)
1504	movdqu	@XMM[2], 0x40($out)
1505	movdqu	@XMM[7], 0x50($out)
1506	movdqu	@XMM[3], 0x60($out)
1507	jmp	.Lecb_dec_done
1508.align	16
1509.Lecb_dec_six:
1510	call	_bsaes_decrypt8
1511	movdqu	@XMM[0], 0x00($out)	# write output
1512	movdqu	@XMM[1], 0x10($out)
1513	movdqu	@XMM[6], 0x20($out)
1514	movdqu	@XMM[4], 0x30($out)
1515	movdqu	@XMM[2], 0x40($out)
1516	movdqu	@XMM[7], 0x50($out)
1517	jmp	.Lecb_dec_done
1518.align	16
1519.Lecb_dec_five:
1520	call	_bsaes_decrypt8
1521	movdqu	@XMM[0], 0x00($out)	# write output
1522	movdqu	@XMM[1], 0x10($out)
1523	movdqu	@XMM[6], 0x20($out)
1524	movdqu	@XMM[4], 0x30($out)
1525	movdqu	@XMM[2], 0x40($out)
1526	jmp	.Lecb_dec_done
1527.align	16
1528.Lecb_dec_four:
1529	call	_bsaes_decrypt8
1530	movdqu	@XMM[0], 0x00($out)	# write output
1531	movdqu	@XMM[1], 0x10($out)
1532	movdqu	@XMM[6], 0x20($out)
1533	movdqu	@XMM[4], 0x30($out)
1534	jmp	.Lecb_dec_done
1535.align	16
1536.Lecb_dec_three:
1537	call	_bsaes_decrypt8
1538	movdqu	@XMM[0], 0x00($out)	# write output
1539	movdqu	@XMM[1], 0x10($out)
1540	movdqu	@XMM[6], 0x20($out)
1541	jmp	.Lecb_dec_done
1542.align	16
1543.Lecb_dec_two:
1544	call	_bsaes_decrypt8
1545	movdqu	@XMM[0], 0x00($out)	# write output
1546	movdqu	@XMM[1], 0x10($out)
1547	jmp	.Lecb_dec_done
1548.align	16
1549.Lecb_dec_one:
1550	call	_bsaes_decrypt8
1551	movdqu	@XMM[0], 0x00($out)	# write output
1552	jmp	.Lecb_dec_done
1553.align	16
1554.Lecb_dec_short:
1555	lea	($inp), $arg1
1556	lea	($out), $arg2
1557	lea	($key), $arg3
1558	call	asm_AES_decrypt
1559	lea	16($inp), $inp
1560	lea	16($out), $out
1561	dec	$len
1562	jnz	.Lecb_dec_short
1563
.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_dec_bzero		# loop while %rax < %rbp (unsigned), as in
					# the CBC/CTR bzero loops; `jb` would fall
					# through after a single 32-byte wipe,
					# leaving the key schedule uncleared

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
1576___
1577$code.=<<___ if ($win64);
1578	movaps	0x40(%rbp), %xmm6
1579	movaps	0x50(%rbp), %xmm7
1580	movaps	0x60(%rbp), %xmm8
1581	movaps	0x70(%rbp), %xmm9
1582	movaps	0x80(%rbp), %xmm10
1583	movaps	0x90(%rbp), %xmm11
1584	movaps	0xa0(%rbp), %xmm12
1585	movaps	0xb0(%rbp), %xmm13
1586	movaps	0xc0(%rbp), %xmm14
1587	movaps	0xd0(%rbp), %xmm15
1588	lea	0xa0(%rax), %rax
1589.Lecb_dec_tail:
1590___
1591$code.=<<___;
1592	mov	-48(%rax), %r15
1593.cfi_restore	%r15
1594	mov	-40(%rax), %r14
1595.cfi_restore	%r14
1596	mov	-32(%rax), %r13
1597.cfi_restore	%r13
1598	mov	-24(%rax), %r12
1599.cfi_restore	%r12
1600	mov	-16(%rax), %rbx
1601.cfi_restore	%rbx
1602	mov	-8(%rax), %rbp
1603.cfi_restore	%rbp
1604	lea	(%rax), %rsp		# restore %rsp
1605.cfi_def_cfa_register	%rsp
1606.Lecb_dec_epilogue:
1607	ret
1608.cfi_endproc
1609.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1610___
1611}
1612$code.=<<___;
1613.extern	asm_AES_cbc_encrypt
1614.globl	ossl_bsaes_cbc_encrypt
1615.type	ossl_bsaes_cbc_encrypt,\@abi-omnipotent
1616.align	16
1617ossl_bsaes_cbc_encrypt:
1618.cfi_startproc
1619	endbranch
1620___
1621$code.=<<___ if ($win64);
1622	mov	48(%rsp),$arg6		# pull direction flag
1623___
1624$code.=<<___;
1625	cmp	\$0,$arg6
1626	jne	asm_AES_cbc_encrypt
1627	cmp	\$128,$arg3
1628	jb	asm_AES_cbc_encrypt
1629
1630	mov	%rsp, %rax
1631.Lcbc_dec_prologue:
1632	push	%rbp
1633.cfi_push	%rbp
1634	push	%rbx
1635.cfi_push	%rbx
1636	push	%r12
1637.cfi_push	%r12
1638	push	%r13
1639.cfi_push	%r13
1640	push	%r14
1641.cfi_push	%r14
1642	push	%r15
1643.cfi_push	%r15
1644	lea	-0x48(%rsp), %rsp
1645.cfi_adjust_cfa_offset	0x48
1646___
1647$code.=<<___ if ($win64);
1648	mov	0xa0(%rsp),$arg5	# pull ivp
1649	lea	-0xa0(%rsp), %rsp
1650	movaps	%xmm6, 0x40(%rsp)
1651	movaps	%xmm7, 0x50(%rsp)
1652	movaps	%xmm8, 0x60(%rsp)
1653	movaps	%xmm9, 0x70(%rsp)
1654	movaps	%xmm10, 0x80(%rsp)
1655	movaps	%xmm11, 0x90(%rsp)
1656	movaps	%xmm12, 0xa0(%rsp)
1657	movaps	%xmm13, 0xb0(%rsp)
1658	movaps	%xmm14, 0xc0(%rsp)
1659	movaps	%xmm15, 0xd0(%rsp)
1660.Lcbc_dec_body:
1661___
1662$code.=<<___;
1663	mov	%rsp, %rbp		# backup %rsp
1664.cfi_def_cfa_register	%rbp
1665	mov	240($arg4), %eax	# rounds
1666	mov	$arg1, $inp		# backup arguments
1667	mov	$arg2, $out
1668	mov	$arg3, $len
1669	mov	$arg4, $key
1670	mov	$arg5, %rbx
1671	shr	\$4, $len		# bytes to blocks
1672
1673	mov	%eax, %edx		# rounds
1674	shl	\$7, %rax		# 128 bytes per inner round key
1675	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1676	sub	%rax, %rsp
1677
1678	mov	%rsp, %rax		# pass key schedule
1679	mov	$key, %rcx		# pass key
1680	mov	%edx, %r10d		# pass rounds
1681	call	_bsaes_key_convert
1682	pxor	(%rsp),%xmm7		# fix up 0 round key
1683	movdqa	%xmm6,(%rax)		# save last round key
1684	movdqa	%xmm7,(%rsp)
1685
1686	movdqu	(%rbx), @XMM[15]	# load IV
1687	sub	\$8,$len
1688.Lcbc_dec_loop:
1689	movdqu	0x00($inp), @XMM[0]	# load input
1690	movdqu	0x10($inp), @XMM[1]
1691	movdqu	0x20($inp), @XMM[2]
1692	movdqu	0x30($inp), @XMM[3]
1693	movdqu	0x40($inp), @XMM[4]
1694	movdqu	0x50($inp), @XMM[5]
1695	mov	%rsp, %rax		# pass key schedule
1696	movdqu	0x60($inp), @XMM[6]
1697	mov	%edx,%r10d		# pass rounds
1698	movdqu	0x70($inp), @XMM[7]
1699	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1700
1701	call	_bsaes_decrypt8
1702
1703	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1704	movdqu	0x00($inp), @XMM[8]	# re-load input
1705	movdqu	0x10($inp), @XMM[9]
1706	pxor	@XMM[8], @XMM[1]
1707	movdqu	0x20($inp), @XMM[10]
1708	pxor	@XMM[9], @XMM[6]
1709	movdqu	0x30($inp), @XMM[11]
1710	pxor	@XMM[10], @XMM[4]
1711	movdqu	0x40($inp), @XMM[12]
1712	pxor	@XMM[11], @XMM[2]
1713	movdqu	0x50($inp), @XMM[13]
1714	pxor	@XMM[12], @XMM[7]
1715	movdqu	0x60($inp), @XMM[14]
1716	pxor	@XMM[13], @XMM[3]
1717	movdqu	0x70($inp), @XMM[15]	# IV
1718	pxor	@XMM[14], @XMM[5]
1719	movdqu	@XMM[0], 0x00($out)	# write output
1720	lea	0x80($inp), $inp
1721	movdqu	@XMM[1], 0x10($out)
1722	movdqu	@XMM[6], 0x20($out)
1723	movdqu	@XMM[4], 0x30($out)
1724	movdqu	@XMM[2], 0x40($out)
1725	movdqu	@XMM[7], 0x50($out)
1726	movdqu	@XMM[3], 0x60($out)
1727	movdqu	@XMM[5], 0x70($out)
1728	lea	0x80($out), $out
1729	sub	\$8,$len
1730	jnc	.Lcbc_dec_loop
1731
1732	add	\$8,$len
1733	jz	.Lcbc_dec_done
1734
1735	movdqu	0x00($inp), @XMM[0]	# load input
1736	mov	%rsp, %rax		# pass key schedule
1737	mov	%edx, %r10d		# pass rounds
1738	cmp	\$2,$len
1739	jb	.Lcbc_dec_one
1740	movdqu	0x10($inp), @XMM[1]
1741	je	.Lcbc_dec_two
1742	movdqu	0x20($inp), @XMM[2]
1743	cmp	\$4,$len
1744	jb	.Lcbc_dec_three
1745	movdqu	0x30($inp), @XMM[3]
1746	je	.Lcbc_dec_four
1747	movdqu	0x40($inp), @XMM[4]
1748	cmp	\$6,$len
1749	jb	.Lcbc_dec_five
1750	movdqu	0x50($inp), @XMM[5]
1751	je	.Lcbc_dec_six
1752	movdqu	0x60($inp), @XMM[6]
1753	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1754	call	_bsaes_decrypt8
1755	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1756	movdqu	0x00($inp), @XMM[8]	# re-load input
1757	movdqu	0x10($inp), @XMM[9]
1758	pxor	@XMM[8], @XMM[1]
1759	movdqu	0x20($inp), @XMM[10]
1760	pxor	@XMM[9], @XMM[6]
1761	movdqu	0x30($inp), @XMM[11]
1762	pxor	@XMM[10], @XMM[4]
1763	movdqu	0x40($inp), @XMM[12]
1764	pxor	@XMM[11], @XMM[2]
1765	movdqu	0x50($inp), @XMM[13]
1766	pxor	@XMM[12], @XMM[7]
1767	movdqu	0x60($inp), @XMM[15]	# IV
1768	pxor	@XMM[13], @XMM[3]
1769	movdqu	@XMM[0], 0x00($out)	# write output
1770	movdqu	@XMM[1], 0x10($out)
1771	movdqu	@XMM[6], 0x20($out)
1772	movdqu	@XMM[4], 0x30($out)
1773	movdqu	@XMM[2], 0x40($out)
1774	movdqu	@XMM[7], 0x50($out)
1775	movdqu	@XMM[3], 0x60($out)
1776	jmp	.Lcbc_dec_done
1777.align	16
1778.Lcbc_dec_six:
1779	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1780	call	_bsaes_decrypt8
1781	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1782	movdqu	0x00($inp), @XMM[8]	# re-load input
1783	movdqu	0x10($inp), @XMM[9]
1784	pxor	@XMM[8], @XMM[1]
1785	movdqu	0x20($inp), @XMM[10]
1786	pxor	@XMM[9], @XMM[6]
1787	movdqu	0x30($inp), @XMM[11]
1788	pxor	@XMM[10], @XMM[4]
1789	movdqu	0x40($inp), @XMM[12]
1790	pxor	@XMM[11], @XMM[2]
1791	movdqu	0x50($inp), @XMM[15]	# IV
1792	pxor	@XMM[12], @XMM[7]
1793	movdqu	@XMM[0], 0x00($out)	# write output
1794	movdqu	@XMM[1], 0x10($out)
1795	movdqu	@XMM[6], 0x20($out)
1796	movdqu	@XMM[4], 0x30($out)
1797	movdqu	@XMM[2], 0x40($out)
1798	movdqu	@XMM[7], 0x50($out)
1799	jmp	.Lcbc_dec_done
1800.align	16
1801.Lcbc_dec_five:
1802	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1803	call	_bsaes_decrypt8
1804	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1805	movdqu	0x00($inp), @XMM[8]	# re-load input
1806	movdqu	0x10($inp), @XMM[9]
1807	pxor	@XMM[8], @XMM[1]
1808	movdqu	0x20($inp), @XMM[10]
1809	pxor	@XMM[9], @XMM[6]
1810	movdqu	0x30($inp), @XMM[11]
1811	pxor	@XMM[10], @XMM[4]
1812	movdqu	0x40($inp), @XMM[15]	# IV
1813	pxor	@XMM[11], @XMM[2]
1814	movdqu	@XMM[0], 0x00($out)	# write output
1815	movdqu	@XMM[1], 0x10($out)
1816	movdqu	@XMM[6], 0x20($out)
1817	movdqu	@XMM[4], 0x30($out)
1818	movdqu	@XMM[2], 0x40($out)
1819	jmp	.Lcbc_dec_done
1820.align	16
1821.Lcbc_dec_four:
1822	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1823	call	_bsaes_decrypt8
1824	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1825	movdqu	0x00($inp), @XMM[8]	# re-load input
1826	movdqu	0x10($inp), @XMM[9]
1827	pxor	@XMM[8], @XMM[1]
1828	movdqu	0x20($inp), @XMM[10]
1829	pxor	@XMM[9], @XMM[6]
1830	movdqu	0x30($inp), @XMM[15]	# IV
1831	pxor	@XMM[10], @XMM[4]
1832	movdqu	@XMM[0], 0x00($out)	# write output
1833	movdqu	@XMM[1], 0x10($out)
1834	movdqu	@XMM[6], 0x20($out)
1835	movdqu	@XMM[4], 0x30($out)
1836	jmp	.Lcbc_dec_done
1837.align	16
1838.Lcbc_dec_three:
1839	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1840	call	_bsaes_decrypt8
1841	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1842	movdqu	0x00($inp), @XMM[8]	# re-load input
1843	movdqu	0x10($inp), @XMM[9]
1844	pxor	@XMM[8], @XMM[1]
1845	movdqu	0x20($inp), @XMM[15]	# IV
1846	pxor	@XMM[9], @XMM[6]
1847	movdqu	@XMM[0], 0x00($out)	# write output
1848	movdqu	@XMM[1], 0x10($out)
1849	movdqu	@XMM[6], 0x20($out)
1850	jmp	.Lcbc_dec_done
1851.align	16
1852.Lcbc_dec_two:
1853	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1854	call	_bsaes_decrypt8
1855	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1856	movdqu	0x00($inp), @XMM[8]	# re-load input
1857	movdqu	0x10($inp), @XMM[15]	# IV
1858	pxor	@XMM[8], @XMM[1]
1859	movdqu	@XMM[0], 0x00($out)	# write output
1860	movdqu	@XMM[1], 0x10($out)
1861	jmp	.Lcbc_dec_done
1862.align	16
1863.Lcbc_dec_one:
1864	lea	($inp), $arg1
1865	lea	0x20(%rbp), $arg2	# buffer output
1866	lea	($key), $arg3
1867	call	asm_AES_decrypt		# doesn't touch %xmm
1868	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1869	movdqu	@XMM[15], ($out)	# write output
1870	movdqa	@XMM[0], @XMM[15]	# IV
1871
.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp		# clear everything from %rsp up to %rbp
	ja	.Lcbc_dec_bzero		# i.e. loop while %rax < %rbp (unsigned)

	lea	0x78(%rbp),%rax		# 0x48 scratch + 6 saved GPRs = 0x78
.cfi_def_cfa	%rax,8
1885___
1886$code.=<<___ if ($win64);
1887	movaps	0x40(%rbp), %xmm6
1888	movaps	0x50(%rbp), %xmm7
1889	movaps	0x60(%rbp), %xmm8
1890	movaps	0x70(%rbp), %xmm9
1891	movaps	0x80(%rbp), %xmm10
1892	movaps	0x90(%rbp), %xmm11
1893	movaps	0xa0(%rbp), %xmm12
1894	movaps	0xb0(%rbp), %xmm13
1895	movaps	0xc0(%rbp), %xmm14
1896	movaps	0xd0(%rbp), %xmm15
1897	lea	0xa0(%rax), %rax
1898.Lcbc_dec_tail:
1899___
1900$code.=<<___;
1901	mov	-48(%rax), %r15
1902.cfi_restore	%r15
1903	mov	-40(%rax), %r14
1904.cfi_restore	%r14
1905	mov	-32(%rax), %r13
1906.cfi_restore	%r13
1907	mov	-24(%rax), %r12
1908.cfi_restore	%r12
1909	mov	-16(%rax), %rbx
1910.cfi_restore	%rbx
1911	mov	-8(%rax), %rbp
1912.cfi_restore	%rbp
1913	lea	(%rax), %rsp		# restore %rsp
1914.cfi_def_cfa_register	%rsp
1915.Lcbc_dec_epilogue:
1916	ret
1917.cfi_endproc
1918.size	ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
1919
1920.globl	ossl_bsaes_ctr32_encrypt_blocks
1921.type	ossl_bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1922.align	16
1923ossl_bsaes_ctr32_encrypt_blocks:
1924.cfi_startproc
1925	endbranch
1926	mov	%rsp, %rax
1927.Lctr_enc_prologue:
1928	push	%rbp
1929.cfi_push	%rbp
1930	push	%rbx
1931.cfi_push	%rbx
1932	push	%r12
1933.cfi_push	%r12
1934	push	%r13
1935.cfi_push	%r13
1936	push	%r14
1937.cfi_push	%r14
1938	push	%r15
1939.cfi_push	%r15
1940	lea	-0x48(%rsp), %rsp
1941.cfi_adjust_cfa_offset	0x48
1942___
1943$code.=<<___ if ($win64);
1944	mov	0xa0(%rsp),$arg5	# pull ivp
1945	lea	-0xa0(%rsp), %rsp
1946	movaps	%xmm6, 0x40(%rsp)
1947	movaps	%xmm7, 0x50(%rsp)
1948	movaps	%xmm8, 0x60(%rsp)
1949	movaps	%xmm9, 0x70(%rsp)
1950	movaps	%xmm10, 0x80(%rsp)
1951	movaps	%xmm11, 0x90(%rsp)
1952	movaps	%xmm12, 0xa0(%rsp)
1953	movaps	%xmm13, 0xb0(%rsp)
1954	movaps	%xmm14, 0xc0(%rsp)
1955	movaps	%xmm15, 0xd0(%rsp)
1956.Lctr_enc_body:
1957___
1958$code.=<<___;
1959	mov	%rsp, %rbp		# backup %rsp
1960.cfi_def_cfa_register	%rbp
1961	movdqu	($arg5), %xmm0		# load counter
1962	mov	240($arg4), %eax	# rounds
1963	mov	$arg1, $inp		# backup arguments
1964	mov	$arg2, $out
1965	mov	$arg3, $len
1966	mov	$arg4, $key
1967	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1968	cmp	\$8, $arg3
1969	jb	.Lctr_enc_short
1970
1971	mov	%eax, %ebx		# rounds
1972	shl	\$7, %rax		# 128 bytes per inner round key
1973	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1974	sub	%rax, %rsp
1975
1976	mov	%rsp, %rax		# pass key schedule
1977	mov	$key, %rcx		# pass key
1978	mov	%ebx, %r10d		# pass rounds
1979	call	_bsaes_key_convert
1980	pxor	%xmm6,%xmm7		# fix up last round key
1981	movdqa	%xmm7,(%rax)		# save last round key
1982
1983	movdqa	(%rsp), @XMM[9]		# load round0 key
1984	lea	.LADD1(%rip), %r11
1985	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1986	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1987	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1988	pshufb	@XMM[8], @XMM[0]
1989	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1990	jmp	.Lctr_enc_loop
1991.align	16
1992.Lctr_enc_loop:
1993	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1994	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1995	movdqa	@XMM[0], @XMM[2]
1996	paddd	0x00(%r11), @XMM[1]	# .LADD1
1997	movdqa	@XMM[0], @XMM[3]
1998	paddd	0x10(%r11), @XMM[2]	# .LADD2
1999	movdqa	@XMM[0], @XMM[4]
2000	paddd	0x20(%r11), @XMM[3]	# .LADD3
2001	movdqa	@XMM[0], @XMM[5]
2002	paddd	0x30(%r11), @XMM[4]	# .LADD4
2003	movdqa	@XMM[0], @XMM[6]
2004	paddd	0x40(%r11), @XMM[5]	# .LADD5
2005	movdqa	@XMM[0], @XMM[7]
2006	paddd	0x50(%r11), @XMM[6]	# .LADD6
2007	paddd	0x60(%r11), @XMM[7]	# .LADD7
2008
2009	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
2010	# to flip byte order in 32-bit counter
2011	movdqa	(%rsp), @XMM[9]		# round 0 key
2012	lea	0x10(%rsp), %rax	# pass key schedule
2013	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
2014	pxor	@XMM[9], @XMM[0]	# xor with round0 key
2015	pxor	@XMM[9], @XMM[1]
2016	pxor	@XMM[9], @XMM[2]
2017	pxor	@XMM[9], @XMM[3]
2018	 pshufb	@XMM[8], @XMM[0]
2019	 pshufb	@XMM[8], @XMM[1]
2020	pxor	@XMM[9], @XMM[4]
2021	pxor	@XMM[9], @XMM[5]
2022	 pshufb	@XMM[8], @XMM[2]
2023	 pshufb	@XMM[8], @XMM[3]
2024	pxor	@XMM[9], @XMM[6]
2025	pxor	@XMM[9], @XMM[7]
2026	 pshufb	@XMM[8], @XMM[4]
2027	 pshufb	@XMM[8], @XMM[5]
2028	 pshufb	@XMM[8], @XMM[6]
2029	 pshufb	@XMM[8], @XMM[7]
2030	lea	.LBS0(%rip), %r11	# constants table
2031	mov	%ebx,%r10d		# pass rounds
2032
2033	call	_bsaes_encrypt8_bitslice
2034
2035	sub	\$8,$len
2036	jc	.Lctr_enc_loop_done
2037
2038	movdqu	0x00($inp), @XMM[8]	# load input
2039	movdqu	0x10($inp), @XMM[9]
2040	movdqu	0x20($inp), @XMM[10]
2041	movdqu	0x30($inp), @XMM[11]
2042	movdqu	0x40($inp), @XMM[12]
2043	movdqu	0x50($inp), @XMM[13]
2044	movdqu	0x60($inp), @XMM[14]
2045	movdqu	0x70($inp), @XMM[15]
2046	lea	0x80($inp),$inp
2047	pxor	@XMM[0], @XMM[8]
2048	movdqa	0x20(%rbp), @XMM[0]	# load counter
2049	pxor	@XMM[9], @XMM[1]
2050	movdqu	@XMM[8], 0x00($out)	# write output
2051	pxor	@XMM[10], @XMM[4]
2052	movdqu	@XMM[1], 0x10($out)
2053	pxor	@XMM[11], @XMM[6]
2054	movdqu	@XMM[4], 0x20($out)
2055	pxor	@XMM[12], @XMM[3]
2056	movdqu	@XMM[6], 0x30($out)
2057	pxor	@XMM[13], @XMM[7]
2058	movdqu	@XMM[3], 0x40($out)
2059	pxor	@XMM[14], @XMM[2]
2060	movdqu	@XMM[7], 0x50($out)
2061	pxor	@XMM[15], @XMM[5]
2062	movdqu	@XMM[2], 0x60($out)
2063	lea	.LADD1(%rip), %r11
2064	movdqu	@XMM[5], 0x70($out)
2065	lea	0x80($out), $out
2066	paddd	0x70(%r11), @XMM[0]	# .LADD8
2067	jnz	.Lctr_enc_loop
2068
2069	jmp	.Lctr_enc_done
2070.align	16
2071.Lctr_enc_loop_done:
2072	add	\$8, $len
2073	movdqu	0x00($inp), @XMM[8]	# load input
2074	pxor	@XMM[8], @XMM[0]
2075	movdqu	@XMM[0], 0x00($out)	# write output
2076	cmp	\$2,$len
2077	jb	.Lctr_enc_done
2078	movdqu	0x10($inp), @XMM[9]
2079	pxor	@XMM[9], @XMM[1]
2080	movdqu	@XMM[1], 0x10($out)
2081	je	.Lctr_enc_done
2082	movdqu	0x20($inp), @XMM[10]
2083	pxor	@XMM[10], @XMM[4]
2084	movdqu	@XMM[4], 0x20($out)
2085	cmp	\$4,$len
2086	jb	.Lctr_enc_done
2087	movdqu	0x30($inp), @XMM[11]
2088	pxor	@XMM[11], @XMM[6]
2089	movdqu	@XMM[6], 0x30($out)
2090	je	.Lctr_enc_done
2091	movdqu	0x40($inp), @XMM[12]
2092	pxor	@XMM[12], @XMM[3]
2093	movdqu	@XMM[3], 0x40($out)
2094	cmp	\$6,$len
2095	jb	.Lctr_enc_done
2096	movdqu	0x50($inp), @XMM[13]
2097	pxor	@XMM[13], @XMM[7]
2098	movdqu	@XMM[7], 0x50($out)
2099	je	.Lctr_enc_done
2100	movdqu	0x60($inp), @XMM[14]
2101	pxor	@XMM[14], @XMM[2]
2102	movdqu	@XMM[2], 0x60($out)
2103	jmp	.Lctr_enc_done
2104
.align	16
.Lctr_enc_short:			# <8 blocks: one block at a time via table-based AES
	lea	0x20(%rbp), $arg1	# counter block kept at 0x20(%rbp)
	lea	0x30(%rbp), $arg2	# keystream block written to 0x30(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax			# big-endian on the wire -> host order
	pxor	0x30(%rbp), @XMM[1]	# plaintext ^= keystream
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax			# back to big-endian
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
					# NOTE(review): on this path no key schedule was
					# allocated, so %rsp==%rbp and this stores to the
					# same counter copy loaded above — confirm
	dec	$len
	jnz	.Lctr_enc_short
2123
2124.Lctr_enc_done:
2125	lea	(%rsp), %rax
2126	pxor	%xmm0, %xmm0
2127.Lctr_enc_bzero:			# wipe key schedule [if any]
2128	movdqa	%xmm0, 0x00(%rax)
2129	movdqa	%xmm0, 0x10(%rax)
2130	lea	0x20(%rax), %rax
2131	cmp	%rax, %rbp
2132	ja	.Lctr_enc_bzero
2133
2134	lea	0x78(%rbp),%rax
2135.cfi_def_cfa	%rax,8
2136___
2137$code.=<<___ if ($win64);
2138	movaps	0x40(%rbp), %xmm6
2139	movaps	0x50(%rbp), %xmm7
2140	movaps	0x60(%rbp), %xmm8
2141	movaps	0x70(%rbp), %xmm9
2142	movaps	0x80(%rbp), %xmm10
2143	movaps	0x90(%rbp), %xmm11
2144	movaps	0xa0(%rbp), %xmm12
2145	movaps	0xb0(%rbp), %xmm13
2146	movaps	0xc0(%rbp), %xmm14
2147	movaps	0xd0(%rbp), %xmm15
2148	lea	0xa0(%rax), %rax
2149.Lctr_enc_tail:
2150___
2151$code.=<<___;
2152	mov	-48(%rax), %r15
2153.cfi_restore	%r15
2154	mov	-40(%rax), %r14
2155.cfi_restore	%r14
2156	mov	-32(%rax), %r13
2157.cfi_restore	%r13
2158	mov	-24(%rax), %r12
2159.cfi_restore	%r12
2160	mov	-16(%rax), %rbx
2161.cfi_restore	%rbx
2162	mov	-8(%rax), %rbp
2163.cfi_restore	%rbp
2164	lea	(%rax), %rsp		# restore %rsp
2165.cfi_def_cfa_register	%rsp
2166.Lctr_enc_epilogue:
2167	ret
2168.cfi_endproc
2169.size	ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
2170___
2171######################################################################
2172# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2173#	const AES_KEY *key1, const AES_KEY *key2,
2174#	const unsigned char iv[16]);
2175#
# XTS tweak temporaries live in the top three SIMD registers.
my ($twmask,$twres,$twtmp)=@XMM[13..15];
# $arg6 was declared as a 32-bit register name (trailing "d"); strip the
# suffix so it is used in its 64-bit form here, where it carries a pointer
# (the iv[16] argument) rather than an int.
$arg6=~s/d$//;
2178
2179$code.=<<___;
2180.globl	ossl_bsaes_xts_encrypt
2181.type	ossl_bsaes_xts_encrypt,\@abi-omnipotent
2182.align	16
2183ossl_bsaes_xts_encrypt:
2184.cfi_startproc
2185	mov	%rsp, %rax
2186.Lxts_enc_prologue:
2187	push	%rbp
2188.cfi_push	%rbp
2189	push	%rbx
2190.cfi_push	%rbx
2191	push	%r12
2192.cfi_push	%r12
2193	push	%r13
2194.cfi_push	%r13
2195	push	%r14
2196.cfi_push	%r14
2197	push	%r15
2198.cfi_push	%r15
2199	lea	-0x48(%rsp), %rsp
2200.cfi_adjust_cfa_offset	0x48
2201___
2202$code.=<<___ if ($win64);
2203	mov	0xa0(%rsp),$arg5	# pull key2
2204	mov	0xa8(%rsp),$arg6	# pull ivp
2205	lea	-0xa0(%rsp), %rsp
2206	movaps	%xmm6, 0x40(%rsp)
2207	movaps	%xmm7, 0x50(%rsp)
2208	movaps	%xmm8, 0x60(%rsp)
2209	movaps	%xmm9, 0x70(%rsp)
2210	movaps	%xmm10, 0x80(%rsp)
2211	movaps	%xmm11, 0x90(%rsp)
2212	movaps	%xmm12, 0xa0(%rsp)
2213	movaps	%xmm13, 0xb0(%rsp)
2214	movaps	%xmm14, 0xc0(%rsp)
2215	movaps	%xmm15, 0xd0(%rsp)
2216.Lxts_enc_body:
2217___
2218$code.=<<___;
2219	mov	%rsp, %rbp		# backup %rsp
2220.cfi_def_cfa_register	%rbp
2221	mov	$arg1, $inp		# backup arguments
2222	mov	$arg2, $out
2223	mov	$arg3, $len
2224	mov	$arg4, $key
2225
2226	lea	($arg6), $arg1
2227	lea	0x20(%rbp), $arg2
2228	lea	($arg5), $arg3
2229	call	asm_AES_encrypt		# generate initial tweak
2230
2231	mov	240($key), %eax		# rounds
2232	mov	$len, %rbx		# backup $len
2233
2234	mov	%eax, %edx		# rounds
2235	shl	\$7, %rax		# 128 bytes per inner round key
2236	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2237	sub	%rax, %rsp
2238
2239	mov	%rsp, %rax		# pass key schedule
2240	mov	$key, %rcx		# pass key
2241	mov	%edx, %r10d		# pass rounds
2242	call	_bsaes_key_convert
2243	pxor	%xmm6, %xmm7		# fix up last round key
2244	movdqa	%xmm7, (%rax)		# save last round key
2245
2246	and	\$-16, $len
2247	sub	\$0x80, %rsp		# place for tweak[8]
2248	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2249
2250	pxor	$twtmp, $twtmp
2251	movdqa	.Lxts_magic(%rip), $twmask
2252	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2253
2254	sub	\$0x80, $len
2255	jc	.Lxts_enc_short
2256	jmp	.Lxts_enc_loop
2257
2258.align	16
2259.Lxts_enc_loop:
2260___
    # Unrolled generator for tweak[0..6]: each iteration saves the current
    # tweak to the stack slot 0x10*$i(%rsp), then advances it by one
    # GF(2^128) doubling (shift left 1, conditionally xor the reduction
    # residue isolated via $twmask).  Input loads and input^tweak xors are
    # staggered one and two iterations behind to interleave with the tweak
    # arithmetic; tweak[7] and the last two blocks are handled after the loop.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
2279$code.=<<___;
2280	movdqu	0x60($inp), @XMM[8+6]
2281	pxor	@XMM[8+5], @XMM[5]
2282	movdqu	0x70($inp), @XMM[8+7]
2283	lea	0x80($inp), $inp
2284	movdqa	@XMM[7], 0x70(%rsp)
2285	pxor	@XMM[8+6], @XMM[6]
2286	lea	0x80(%rsp), %rax	# pass key schedule
2287	pxor	@XMM[8+7], @XMM[7]
2288	mov	%edx, %r10d		# pass rounds
2289
2290	call	_bsaes_encrypt8
2291
2292	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2293	pxor	0x10(%rsp), @XMM[1]
2294	movdqu	@XMM[0], 0x00($out)	# write output
2295	pxor	0x20(%rsp), @XMM[4]
2296	movdqu	@XMM[1], 0x10($out)
2297	pxor	0x30(%rsp), @XMM[6]
2298	movdqu	@XMM[4], 0x20($out)
2299	pxor	0x40(%rsp), @XMM[3]
2300	movdqu	@XMM[6], 0x30($out)
2301	pxor	0x50(%rsp), @XMM[7]
2302	movdqu	@XMM[3], 0x40($out)
2303	pxor	0x60(%rsp), @XMM[2]
2304	movdqu	@XMM[7], 0x50($out)
2305	pxor	0x70(%rsp), @XMM[5]
2306	movdqu	@XMM[2], 0x60($out)
2307	movdqu	@XMM[5], 0x70($out)
2308	lea	0x80($out), $out
2309
2310	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2311	pxor	$twtmp, $twtmp
2312	movdqa	.Lxts_magic(%rip), $twmask
2313	pcmpgtd	@XMM[7], $twtmp
2314	pshufd	\$0x13, $twtmp, $twres
2315	pxor	$twtmp, $twtmp
2316	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2317	pand	$twmask, $twres		# isolate carry and residue
2318	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2319	pxor	$twres, @XMM[7]
2320
2321	sub	\$0x80,$len
2322	jnc	.Lxts_enc_loop
2323
2324.Lxts_enc_short:
2325	add	\$0x80, $len
2326	jz	.Lxts_enc_done
2327___
2328    for ($i=0;$i<7;$i++) {
2329    $code.=<<___;
2330	pshufd	\$0x13, $twtmp, $twres
2331	pxor	$twtmp, $twtmp
2332	movdqa	@XMM[7], @XMM[$i]
2333	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2334	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2335	pand	$twmask, $twres		# isolate carry and residue
2336	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2337	pxor	$twres, @XMM[7]
2338___
2339    $code.=<<___ if ($i>=1);
2340	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2341	cmp	\$`0x10*$i`,$len
2342	je	.Lxts_enc_$i
2343___
2344    $code.=<<___ if ($i>=2);
2345	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2346___
2347    }
# Tails of the XTS-encrypt short path: fall-through handles exactly 7
# blocks, then one .Lxts_enc_N label per remaining count 6..1.  Each
# tail finishes tweak-whitening the inputs, calls _bsaes_encrypt8 (the
# 1-block tail uses table-based asm_AES_encrypt instead, since the
# bit-sliced core only pays off for many blocks), un-whitens the
# outputs with the tweaks saved on the stack — note the register order
# 0,1,4,6,3,7,2 is the bit-sliced core's output permutation, not
# sequential — stores them, and reloads the next-iteration tweak.
# .Lxts_enc_done then performs ciphertext stealing for a trailing
# partial block (%ebx holds $len%16), and .Lxts_enc_ret wipes the
# on-stack key schedule/tweaks before restoring the CFA.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
# Win64 only: restore the non-volatile %xmm6-%xmm15 saved in the
# prologue (same 0x40..0xd0(%rbp) slots), skip their stack area, and
# place .Lxts_enc_tail — the SEH handler's HandlerData[2] marker used
# by se_handler below to tell "xmm already restored" unwinds apart.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_enc_tail:
___
# Epilogue of ossl_bsaes_xts_encrypt: pop the six callee-saved GPRs
# from below %rax (with matching CFI restores) and return.  Then the
# prologue of ossl_bsaes_xts_decrypt, which mirrors the encrypt side:
# save %rax=%rsp, push callee-saved GPRs, carve 0x48 bytes of scratch.
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lxts_enc_epilogue:
	ret
.cfi_endproc
.size	ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

.globl	ossl_bsaes_xts_decrypt
.type	ossl_bsaes_xts_decrypt,\@abi-omnipotent
.align	16
ossl_bsaes_xts_decrypt:
.cfi_startproc
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
# Win64 only: arguments 5 and 6 (key2, ivp) arrive on the stack, so
# pull them before moving %rsp, then save non-volatile %xmm6-%xmm15 in
# the slots the epilogue above restores from.  .Lxts_dec_body marks
# the start of the SEH-covered body (HandlerData[0] in .Lxts_dec_info).
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
# XTS-decrypt setup: back up args, encrypt the IV with key2 via
# asm_AES_encrypt to form the initial tweak (XTS always *encrypts* the
# tweak, even when decrypting data), convert key1 into the bit-sliced
# schedule on freshly carved stack, round $len down to whole blocks —
# reserving one block if there is a trailing partial ($len%16 != 0, for
# ciphertext stealing) — reserve 0x80 bytes for tweak[8], and enter the
# 8-block main loop when at least 0x80 bytes remain.
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
    # Main-loop tweak generation for XTS decrypt (same shape as the
    # encrypt side): each pass doubles the running tweak in GF(2^128)
    # via .Lxts_magic, stores tweak[$i] in @XMM[$i] and on the stack,
    # overlaps the load of input block $i-1, and defers the
    # tweak-whitening pxor of block $i-2 by one pass.  Unlike the
    # short-path loop below, there is no length compare/branch —
    # the main loop always processes a full 8 blocks.
    for ($i=0;$i<7;$i++) {
    # Tweak doubling + save of tweak[$i].
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    # Overlapped load of input block $i-1.
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    # Deferred whitening of block $i-2.
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
# Body of the 8-block XTS-decrypt main loop: finish loading/whitening
# blocks 6 and 7, run _bsaes_decrypt8, un-whiten the outputs with the
# stacked tweaks — register order 0,1,6,4,2,7,3,5 is the decrypt core's
# output permutation — store 0x80 bytes, then derive the next
# iteration's tweak from the saved tweak[7] and loop while >= 0x80
# bytes remain.  .Lxts_dec_short re-biases $len and handles 1..7
# leftover blocks via the dispatch loop that follows.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
    # Short-input path of XTS decrypt: identical structure to the
    # encrypt-side short loop — double the tweak, save tweak[$i],
    # load the previous block, dispatch to .Lxts_dec_$i when exactly
    # $i blocks remain, and whiten block $i-2 one pass late.
    for ($i=0;$i<7;$i++) {
    # Tweak doubling + save of tweak[$i].
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    # Load block $i-1; branch to the exact-length tail when $len
    # says only $i blocks are left.
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
    # Deferred whitening of block $i-2.
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
# Tails of the XTS-decrypt short path, mirroring the encrypt tails:
# fall-through = 7 blocks, then .Lxts_dec_N for 6..1 (the 1-block tail
# uses asm_AES_decrypt).  Output un-whitening uses the decrypt core's
# register order 0,1,6,4,2,7,3,5.  .Lxts_dec_done handles decrypt-side
# ciphertext stealing, which differs from encrypt: the *next* tweak
# (one extra GF(2^128) doubling, computed here with the current tweak
# kept in @XMM[6]) decrypts the last full ciphertext block first, the
# byte-swap loop then splices in the trailing partial, and the result
# is re-decrypted with the *previous* tweak from @XMM[6].
# .Lxts_dec_ret wipes the on-stack key schedule/tweaks.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
# Win64 only: restore %xmm6-%xmm15 from the prologue save area and
# emit .Lxts_dec_tail (HandlerData[2] for the SEH handler), mirroring
# the encrypt-side restore block.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_dec_tail:
___
# Epilogue of ossl_bsaes_xts_decrypt: pop the callee-saved GPRs from
# below %rax with matching CFI annotations, restore %rsp and return.
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lxts_dec_epilogue:
	ret
.cfi_endproc
.size	ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
___
3003}
# Read-only constant pool (64-byte aligned, addressed %rip-relative):
# pshufb permutations for (inv)ShiftRows and bit-slice loading
# (.LM0*/.LISR*/.LSR*), bit-slice masks (.LBS0-2, .Lmasks), CTR
# counter increments (.LADD1-8), the XTS GF(2^128) reduction constant
# 0x87 (.Lxts_magic), and the AES affine constant 0x63 (.L63).
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___
3061
3062# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3063#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception-handling support: a shared se_handler that
# virtually unwinds any of the six entry points, plus the .pdata
# (function start/end/info RVAs) and .xdata (handler + HandlerData
# RVAs) tables that bind it to them.  HandlerData per function is
# [0]=body-start label, [1]=epilogue label, [2]=tail label (the point
# after which %xmm6-15 are already restored).
if ($win64) {
# Register aliases for the EXCEPTION_DISPOSITION handler arguments.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler: if the fault RIP is inside [body, epilogue) it performs
# the unwind by hand — before the tail label it also copies the saved
# %xmm6-15 block (at 0x40(context->Rbp)) back into the CONTEXT — then
# restores the six GPRs from below the adjusted %rax, patches the
# CONTEXT, and calls RtlVirtualUnwind to continue the search.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<=prologue label
	jbe	.Lin_prologue

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=tail label
	jae	.Lin_tail

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer

.Lin_tail:
	mov	-48(%rax),%rbp
	mov	-40(%rax),%rbx
	mov	-32(%rax),%r12
	mov	-24(%rax),%r13
	mov	-16(%rax),%r14
	mov	-8(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
# ECB entries exist only when the $ecb build flag enabled those
# functions earlier in the file.
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
# .pdata entries for the always-built CBC/CTR/XTS entry points.
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
# .xdata: byte 9 = UNW_FLAG_EHANDLER with zero unwind codes; each
# record points at se_handler and its three HandlerData labels.
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
	.rva	.Lecb_enc_tail
	.long	0
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
	.rva	.Lecb_dec_tail
	.long	0
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
	.rva	.Lcbc_dec_tail
	.long	0
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
	.rva	.Lctr_enc_tail
	.long	0
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
	.rva	.Lxts_enc_tail
	.long	0
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
	.rva	.Lxts_dec_tail
	.long	0
___
}
3238
# Resolve every backtick-quoted span in the accumulated assembly by
# eval()-ing it as Perl — used above for generation-time arithmetic
# such as `0x10*$i` and `128-32` — then emit the finished assembly.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# A failed flush of the generated assembly must abort the build.
close STDOUT or die "error closing STDOUT: $!";
3244