1#! /usr/bin/env perl
2# Copyright 2009-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20# details].
21#
22# Performance.
23#
24# To start with see corresponding paragraph in aesni-x86_64.pl...
25# Instead of filling table similar to one found there I've chosen to
26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27# The simplified table below represents 32-bit performance relative
28# to 64-bit one in every given point. Ratios vary for different
29# encryption modes, therefore interval values.
30#
31#	16-byte     64-byte     256-byte    1-KB        8-KB
32#	53-67%      67-84%      91-94%      95-98%      97-99.5%
33#
34# Lower ratios for smaller block sizes are perfectly understandable,
35# because function call overhead is higher in 32-bit mode. Largest
36# 8-KB block performance is virtually same: 32-bit code is less than
37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39# January 2011
40#
41# See aesni-x86_64.pl for details. Unlike x86_64 version this module
42# interleaves at most 6 aes[enc|dec] instructions, because there are
43# not enough registers for 8x interleave [which should be optimal for
44# Sandy Bridge]. Actually, performance results for 6x interleave
45# factor presented in aesni-x86_64.pl (except for CTR) are for this
46# module.
47
48# April 2011
49#
50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
52
53# November 2015
54#
55# Add aesni_ocb_[en|de]crypt.
56
57######################################################################
58# Current large-block performance in cycles per byte processed with
59# 128-bit key (less is better).
60#
61#		CBC en-/decrypt	CTR	XTS	ECB	OCB
62# Westmere	3.77/1.37	1.37	1.52	1.27
63# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
64# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
65# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
66# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
67# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
68# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23
69
# Symbol prefix for the generated entry points.  If it is changed to
# "AES", the script generates a drop-in replacement for
# crypto/aes/asm/aes-586.pl.
$PREFIX = 'aesni';

# When true, single-block processing is expanded in place through
# aesni_inline_generate1; when false, the code calls the separate
# _aesni_[en|de]crypt1 subroutines instead.
$inline = 1;
74
# Locate the directory holding this script so that the perlasm helper
# module can be found relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Everything the
# generator prints is redirected there, so a failure to open it must
# abort the run instead of silently discarding the generated code.
$output = pop;
die "$0: no output file specified\n" unless (defined($output));
open OUT,">",$output or die "$0: can't open $output: $!\n";
*STDOUT=*OUT;

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# $movekey is the instruction used to load round keys.  Both settings of
# $PREFIX currently select movups.
if ($PREFIX eq "aesni")	{ $movekey=\&movups; }
else			{ $movekey=\&movups; }
90
# General-purpose register roles.  $rounds_ and $key_ hold backup
# copies of $rounds and $key.
($len,$rounds,$key,$inp,$out)	= ("eax","ecx","edx","esi","edi");
($rounds_,$key_)		= ("ebx","ebp");

# XMM register roles.  xmm0/xmm1 carry round keys, xmm2..xmm7 carry
# data.  The last three data registers have second names, so $in1,
# $in0 and $ivec alias $inout3, $inout4 and $inout5 respectively.
($rndkey0,$rndkey1)		= ("xmm0","xmm1");
($inout0,$inout1,$inout2,
 $inout3,$inout4,$inout5)	= map { "xmm$_" } (2..7);
($in1,$in0,$ivec)		= ($inout3,$inout4,$inout5);
107
108# AESNI extension
# Emit AESKEYGENASSIST $dst,$src,$imm as raw opcode bytes.
#
#   $dst, $src - XMM register names, "xmm0" through "xmm7"
#   $imm       - immediate byte, emitted last
#
# Only the register-to-register form can be encoded here.  Any other
# operand used to be dropped without emitting anything (and a name such
# as "xmm10" was accepted as "xmm1"), so the pattern is anchored and
# unsupported operands abort generation.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /^xmm([0-7]):xmm([0-7])$/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: unsupported operands '$dst','$src'\n";	}
}
# Emit one register-to-register AES-NI instruction as raw opcode bytes:
# the 0x66,0x0f,0x38 prefix, then $opcodelet, then a byte built from
# the two register numbers.
#
#   $opcodelet - opcode byte selecting the instruction
#   $dst, $src - XMM register names, "xmm0" through "xmm7"
#
# Any other operand used to be dropped without emitting anything (and a
# name such as "xmm10" was accepted as "xmm1"), so the pattern is
# anchored and unsupported operands abort generation.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /^xmm([0-7]):xmm([0-7])$/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
    else
    {	die sprintf("aescommon(0x%02x): unsupported operands '%s','%s'\n",
		    $opcodelet,$dst,$src);
    }
}
# One wrapper per AES-NI mnemonic; each forwards its operands to
# aescommon together with the opcode byte for that instruction.
sub aesimc
{ my @operands=@_;	return aescommon(0xdb,@operands);	}
sub aesenc
{ my @operands=@_;	return aescommon(0xdc,@operands);	}
sub aesenclast
{ my @operands=@_;	return aescommon(0xdd,@operands);	}
sub aesdec
{ my @operands=@_;	return aescommon(0xde,@operands);	}
sub aesdeclast
{ my @operands=@_;	return aescommon(0xdf,@operands);	}
124
125# Inline version of internal aesni_[en|de]crypt1
{ my $sn;	# bumped on every expansion so loop labels stay unique

# Emit, in place, the code that takes one block through all rounds.
#
#   $p     - "enc" or "dec"; selects aes${p} and aes${p}last
#   $inout - register holding the block, defaults to $inout0
#   $ivec  - optional register; when given, round key 0 is XORed into
#            it and it is then XORed into $inout, replacing the plain
#            round-key-0 XOR of $inout
#
# The emitted code advances $key past every round key it loads and
# decrements $rounds once per loop iteration.
#
# The aes helpers are looked up once as code references.  The former
# string-eval dispatch depended on bareword operands and silently
# discarded any error raised while emitting an instruction.
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  no strict 'refs';
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};
  $sn++;
  my $loop = "${p}1_loop_$sn";

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    if (defined($ivec))	{ &xorps($inout,$ivec);		}
    else		{ &xorps($inout,$rndkey0);	}
    &set_label($loop);
	$aes->($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label($loop));
    $aeslast->($inout,$rndkey1);
}}
145
# Emit the private subroutine _aesni_${p}rypt1: a fully unrolled
# single-block routine.
#
#   $p     - "enc" or "dec"; selects aes${p} and aes${p}last
#   $inout - register holding the block, defaults to $inout0
#
# The emitted code compares $rounds with 11 and enters the unrolled
# sequence at one of three points: below 11 jumps to label ${p}128,
# equal jumps to ${p}192, above falls through.  $key is advanced by
# 0x20 for each of the two longer paths so that the shared tail can use
# fixed offsets.
#
# NOTE(review): the very first key load uses movups directly, the rest
# go through $movekey; the two are the same while $movekey is movups.
#
# The aes helpers are looked up once as code references.  The former
# string-eval dispatch depended on bareword operands and silently
# discarded any error raised while emitting an instruction.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  no strict 'refs';
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};
  my $name    = "_aesni_${p}rypt1";

  # Two rounds: apply $rndkey1 and reload it from $off, then apply
  # $rndkey0 and reload it from $off+0x10.
  my $two_rounds = sub
  { my $off=shift;
	$aes->($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP($off,$key));
	$aes->($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP($off+0x10,$key));
  };

    &function_begin_B($name);
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	$two_rounds->(-0x40);
    &set_label("${p}192");
	$two_rounds->(-0x20);
    &set_label("${p}128");
	foreach my $off (0,0x20,0x40,0x60)	{ $two_rounds->($off); }
	$aes->($inout,$rndkey1);
    $aeslast->($inout,$rndkey0);
    &ret();
    &function_end_B($name);
}
191
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block encryption entry point.  The block is loaded from inp,
# the round count is read from offset 240 of key, and the result is
# written to out.  All three XMM registers used are zeroed before
# returning.
&aesni_generate1("enc") unless ($inline);
&function_begin_B("${PREFIX}_encrypt");
{   my $ptr="eax";			# first inp, later out

	&mov	($ptr,&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,$ptr));
	&mov	($rounds,&DWP(240,$key));
	&mov	($ptr,&wparam(1));

	# either expand the rounds here or call the shared subroutine
	if ($inline)	{ &aesni_inline_generate1("enc");	}
	else		{ &call("_aesni_encrypt1");		}

	# clear register bank; the result is stored before its own
	# register is wiped
	foreach my $rk ($rndkey0,$rndkey1)	{ &pxor($rk,$rk); }
	&movups	(&QWP(0,$ptr),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
}
&function_end_B("${PREFIX}_encrypt");
210
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block decryption entry point.  The block is loaded from inp,
# the round count is read from offset 240 of key, and the result is
# written to out.  All three XMM registers used are zeroed before
# returning.
&aesni_generate1("dec") unless ($inline);
&function_begin_B("${PREFIX}_decrypt");
{   my $ptr="eax";			# first inp, later out

	&mov	($ptr,&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,$ptr));
	&mov	($rounds,&DWP(240,$key));
	&mov	($ptr,&wparam(1));

	# either expand the rounds here or call the shared subroutine
	if ($inline)	{ &aesni_inline_generate1("dec");	}
	else		{ &call("_aesni_decrypt1");		}

	# clear register bank; the result is stored before its own
	# register is wiped
	foreach my $rk ($rndkey0,$rndkey1)	{ &pxor($rk,$rk); }
	&movups	(&QWP(0,$ptr),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
}
&function_end_B("${PREFIX}_decrypt");
229
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when the subroutine's throughput is virtually the
# same as that of a non-interleaved subroutine [for number of input
# blocks up to 3]. This is why it originally made no sense to implement
# a 2x subroutine. But times change and it became appropriate to spend
# an extra 192 bytes on a 2x subroutine on account of Atom Silvermont.
# For processors that can schedule aes[enc|dec] every cycle the optimal
# interleave factor equals the corresponding instruction's latency. 8x
# is optimal for * Bridge, but it's unfeasible to accommodate such an
# implementation in XMM registers addressable in 32-bit mode and
# therefore a maximum of 6x is used instead...
244
# Emit the private subroutine _aesni_${p}rypt2, which processes the two
# blocks in $inout0 and $inout1 with interleaved rounds.
#
#   $p - "enc" or "dec"; selects aes${p} and aes${p}last
#
# The emitted code scales $rounds by 16, points $key just past the
# scaled round keys and negates $rounds, so the loop indexes the key
# schedule as $key+$rounds while adding 32 to $rounds per iteration.
#
# The aes helpers are looked up once as code references.  The former
# string-eval dispatch depended on bareword operands and silently
# discarded any error raised while emitting an instruction.
sub aesni_generate2
{ my $p=shift;
  no strict 'refs';
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};
  my @lanes   = ($inout0,$inout1);
  my $loop    = "${p}2_loop";

  # apply one instruction with one round key to every lane, in order
  my $all = sub
  { my ($op,$rk)=@_;
	foreach my $lane (@lanes)	{ $op->($lane,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label($loop);
	$all->($aes,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$all->($aes,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($loop));
    $all->($aes,$rndkey1);
    $all->($aeslast,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
275
# Emit the private subroutine _aesni_${p}rypt3, which processes the
# three blocks in $inout0..$inout2 with interleaved rounds.
#
#   $p - "enc" or "dec"; selects aes${p} and aes${p}last
#
# The emitted code scales $rounds by 16, points $key just past the
# scaled round keys and negates $rounds, so the loop indexes the key
# schedule as $key+$rounds while adding 32 to $rounds per iteration.
#
# The aes helpers are looked up once as code references.  The former
# string-eval dispatch depended on bareword operands and silently
# discarded any error raised while emitting an instruction.
sub aesni_generate3
{ my $p=shift;
  no strict 'refs';
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};
  my @lanes   = ($inout0,$inout1,$inout2);
  my $loop    = "${p}3_loop";

  # apply one instruction with one round key to every lane, in order
  my $all = sub
  { my ($op,$rk)=@_;
	foreach my $lane (@lanes)	{ $op->($lane,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label($loop);
	$all->($aes,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$all->($aes,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($loop));
    $all->($aes,$rndkey1);
    $all->($aeslast,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
311
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 blocks by ~30%. One can argue that one
# should have implemented 5x as well, but the improvement would be <20%,
# so it's not worth it...
# Emit the private subroutine _aesni_${p}rypt4, which processes the four
# blocks in $inout0..$inout3 with interleaved rounds.
#
#   $p - "enc" or "dec"; selects aes${p} and aes${p}last
#
# The emitted code scales $rounds by 16, points $key just past the
# scaled round keys and negates $rounds, so the loop indexes the key
# schedule as $key+$rounds while adding 32 to $rounds per iteration.
#
# The aes helpers are looked up once as code references.  The former
# string-eval dispatch depended on bareword operands and silently
# discarded any error raised while emitting an instruction.
sub aesni_generate4
{ my $p=shift;
  no strict 'refs';
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};
  my @lanes   = ($inout0,$inout1,$inout2,$inout3);
  my $loop    = "${p}4_loop";

  # apply one instruction with one round key to every lane, in order
  my $all = sub
  { my ($op,$rk)=@_;
	foreach my $lane (@lanes)	{ $op->($lane,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	# four raw bytes (0f 1f 40 00) placed ahead of the loop label;
	# presumably a multi-byte NOP used as padding - confirm against
	# the instruction set reference
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label($loop);
	$all->($aes,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$all->($aes,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label($loop));

    $all->($aes,$rndkey1);
    $all->($aeslast,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
358
# Emit the private subroutine _aesni_${p}rypt6, which processes the six
# blocks in $inout0..$inout5 with interleaved rounds.
#
#   $p - "enc" or "dec"; selects aes${p} and aes${p}last
#
# The prologue starts the first round on $inout0..$inout2 while the
# remaining blocks are still being XORed with round key 0, then jumps
# to the _aesni_${p}rypt6_inner label in the middle of the loop to
# finish that round.  A second label, _aesni_${p}rypt6_enter, is
# declared with static_label and sits at the point where the
# $rndkey1 reload begins; other code in this file calls it directly.
#
# The aes helpers are looked up once as code references.  The former
# string-eval dispatch depended on bareword operands and silently
# discarded any error raised while emitting an instruction.
sub aesni_generate6
{ my $p=shift;
  no strict 'refs';
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};
  my @lanes   = ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
  my $inner   = "_aesni_${p}rypt6_inner";
  my $enter   = "_aesni_${p}rypt6_enter";
  my $loop    = "${p}6_loop";

  # apply one instruction with one round key to the listed lanes
  my $apply = sub
  { my ($op,$rk,@which)=@_;
	foreach my $lane (@which)	{ $op->($lane,$rk); }
  };

    &function_begin_B("_aesni_${p}rypt6");
    &static_label($enter);
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	$aes->($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	$aes->($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	$aes->($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label($inner));

    &set_label($loop,16);
	$apply->($aes,$rndkey1,@lanes[0..2]);
    &set_label($inner);
	$apply->($aes,$rndkey1,@lanes[3..5]);
    &set_label($enter);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$apply->($aes,$rndkey0,@lanes);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label($loop));

    $apply->($aes,$rndkey1,@lanes);
    $apply->($aeslast,$rndkey0,@lanes);
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Instantiate the interleaved subroutines in order of interleave
# factor.  The decrypt flavour is always generated; the encrypt flavour
# only when $PREFIX is "aesni".
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
		      \&aesni_generate4,\&aesni_generate6)
{
    $generate->("enc")	if ($PREFIX eq "aesni");
    $generate->("dec");
}
425
426if ($PREFIX eq "aesni") {
427######################################################################
428# void aesni_ecb_encrypt (const void *in, void *out,
429#                         size_t length, const AES_KEY *key,
430#                         int enc);
{   # scope for the lexical helpers used while emitting aesni_ecb_encrypt

my @blk  = ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
my @word = ("one","two","three","four");

# Store the first $n data registers to consecutive 16-byte slots at $out.
my $store = sub
{ my $n=shift;
    foreach my $i (0..$n-1)	{ &movups(&QWP(0x10*$i,$out),$blk[$i]); }
};

# Emit one direction of the ECB body.
#
#   $dir   - "enc" or "dec"; selects labels ecb_${dir}_* and the
#            _aesni_${dir}ryptN subroutines
#   $final - true for the half emitted last: its four-block case falls
#            through to ecb_ret instead of jumping there
#
# Six blocks are processed per iteration while at least 0x60 bytes
# remain; what is left (one to five blocks) is dispatched on $len to
# the 1x, 2x, 3x, 4x or 6x subroutine.
my $ecb_half = sub
{ my ($dir,$final)=@_;
  my $crypt = "_aesni_${dir}rypt";
  my $lbl   = sub { "ecb_${dir}_".shift };

	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label($lbl->("tail")));

	# preload the first six blocks
	foreach my $i (0..5)	{ &movdqu($blk[$i],&QWP(0x10*$i,$inp)); }
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label($lbl->("loop6_enter")));

    &set_label($lbl->("loop6"),16);
	# store the previous result and load the next input, register
	# by register
	foreach my $i (0..4)
	{   &movups	(&QWP(0x10*$i,$out),$blk[$i]);
	    &movdqu	($blk[$i],&QWP(0x10*$i,$inp));
	}
	&movups	(&QWP(0x50,$out),$blk[5]);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($blk[5],&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
    &set_label($lbl->("loop6_enter"));

	&call	("${crypt}6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label($lbl->("loop6")));

	$store->(6);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

    &set_label($lbl->("tail"));
	&movups	($blk[0],&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label($lbl->("one")));
	&movups	($blk[1],&QWP(0x10,$inp));
	&je	(&label($lbl->("two")));
	&movups	($blk[2],&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label($lbl->("three")));
	&movups	($blk[3],&QWP(0x30,$inp));
	&je	(&label($lbl->("four")));
	# five blocks: run the 6x subroutine with a zeroed sixth lane
	&movups	($blk[4],&QWP(0x40,$inp));
	&xorps	($blk[5],$blk[5]);
	&call	("${crypt}6");
	$store->(5);
	&jmp	(&label("ecb_ret"));

    &set_label($lbl->("one"),16);
	if ($inline)	{ &aesni_inline_generate1($dir);	}
	else		{ &call("${crypt}1");			}
	$store->(1);
	&jmp	(&label("ecb_ret"));

	foreach my $n (2..4)
	{
    &set_label($lbl->($word[$n-1]),16);
	    &call	("${crypt}$n");
	    $store->($n);
	    &jmp	(&label("ecb_ret"))	unless ($final && $n==4);
	}
};

&function_begin("aesni_ecb_encrypt");
	# arguments, in declaration order: in, out, length, key, enc
	my @param = ($inp,$out,$len,$key,$rounds_);
	foreach my $i (0..$#param)	{ &mov($param[$i],&wparam($i)); }
	&and	($len,-16);		# whole blocks only
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);	# enc==0 selects decryption
	&jz	(&label("ecb_decrypt"));

	$ecb_half->("enc",0);
######################################################################
&set_label("ecb_decrypt",16);
	$ecb_half->("dec",1);

&set_label("ecb_ret");
	# clear register bank
	foreach my $n (0..7)
	{   my $xmm="xmm$n";
	    &pxor	($xmm,$xmm);
	}
&function_end("aesni_ecb_encrypt");

}
651
652######################################################################
653# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
654#                         size_t blocks, const AES_KEY *key,
655#                         const char *ivec,char *cmac);
656#
657# Handles only complete blocks, operates on 64-bit counter and
658# does not update *ivec! Nor does it finalize CMAC value
659# (see engine/eng_aesni.c for details)
660#
{ my $cmac=$inout1;

# Prologue shared by both CCM64 routines: load the six arguments, carve
# out a 16-byte aligned scratch area on the stack, load *ivec and *cmac,
# read the round count from offset 240 of key, and build two constants
# in the scratch area:
#	0	pshufb byte-swap mask
#	16	counter increment vector (dwords 1,0,0,0)
#	48	saved %esp
my $ccm64_setup = sub
{ my @param = ($inp,$out,$len,$key,$rounds_,$rounds);
	foreach my $i (0..$#param)	{ &mov($param[$i],&wparam($i)); }
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	my @mask = (0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203);
	foreach my $i (0..$#mask)	{ &mov(&DWP(4*$i,"esp"),$mask[$i]); }

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	foreach my $off (20,24,28)	{ &mov(&DWP($off,"esp"),$key_); }
};

# Epilogue shared by both routines: restore %esp, write the running
# CMAC back through the sixth argument and clear the register bank.
my $ccm64_finish = sub
{
	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	foreach my $n (0..7)
	{   my $xmm="xmm$n";
	    &pxor	($xmm,$xmm);
	}
};

# Apply $op with round key $rk to the counter block and then the CMAC.
my $both = sub
{ my ($op,$rk)=@_;
	$op->($inout0,$rk);
	$op->($cmac,$rk);
};

# Two-lane round loop labelled $name, two rounds per iteration.
my $ccm64_rounds = sub
{ my $name=shift;
    &set_label($name);
	$both->(\&aesenc,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$both->(\&aesenc,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($name));
};

# Encryption: per block the counter is encrypted and the input is
# folded into the CMAC in the same interleaved pass.
&function_begin("aesni_ccm64_encrypt_blocks");
	$ccm64_setup->();

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));	# byte-swap mask
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

	$ccm64_rounds->("ccm64_enc2_loop");
	$both->(\&aesenc,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);
	$both->(\&aesenclast,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	$ccm64_finish->();
&function_end("aesni_ccm64_encrypt_blocks");

# Decryption: the first counter block is encrypted on its own; after
# that each pass encrypts the next counter while folding the output
# just produced into the CMAC.  The last output block is folded in by
# a final single-block pass at ccm64_dec_break.
&function_begin("aesni_ccm64_decrypt_blocks");
	$ccm64_setup->();

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	if ($inline)	{ &aesni_inline_generate1("enc");	}
	else		{ &call("_aesni_encrypt1");		}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

	$ccm64_rounds->("ccm64_dec2_loop");
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	$both->(\&aesenc,$rndkey1);
	$both->(\&aesenclast,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline)	{ &aesni_inline_generate1("enc",$cmac,$in0);	}
	else		{ &call("_aesni_encrypt1",$cmac);		}

	$ccm64_finish->();
&function_end("aesni_ccm64_decrypt_blocks");
}
859
860######################################################################
861# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
862#                         size_t blocks, const AES_KEY *key,
863#                         const char *ivec);
864#
865# Handles only complete blocks, operates on 32-bit counter and
866# does not update *ivec! (see crypto/modes/ctr128.c for details)
867#
868# stack layout:
869#	0	pshufb mask
870#	16	vector addend: 0,6,6,6
871# 	32	counter-less ivec
872#	48	1st triplet of counter vector
873#	64	2nd triplet of counter vector
874#	80	saved %esp
875
876&function_begin("aesni_ctr32_encrypt_blocks");
877	&mov	($inp,&wparam(0));
878	&mov	($out,&wparam(1));
879	&mov	($len,&wparam(2));
880	&mov	($key,&wparam(3));
881	&mov	($rounds_,&wparam(4));
882	&mov	($key_,"esp");
883	&sub	("esp",88);
884	&and	("esp",-16);			# align stack
885	&mov	(&DWP(80,"esp"),$key_);
886
887	&cmp	($len,1);
888	&je	(&label("ctr32_one_shortcut"));
889
890	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
891
892	# compose byte-swap control mask for pshufb on stack
893	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
894	&mov	(&DWP(4,"esp"),0x08090a0b);
895	&mov	(&DWP(8,"esp"),0x04050607);
896	&mov	(&DWP(12,"esp"),0x00010203);
897
898	# compose counter increment vector on stack
899	&mov	($rounds,6);
900	&xor	($key_,$key_);
901	&mov	(&DWP(16,"esp"),$rounds);
902	&mov	(&DWP(20,"esp"),$rounds);
903	&mov	(&DWP(24,"esp"),$rounds);
904	&mov	(&DWP(28,"esp"),$key_);
905
906	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
907	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
908
909	&mov	($rounds,&DWP(240,$key));	# key->rounds
910
911	# compose 2 vectors of 3x32-bit counters
912	&bswap	($rounds_);
913	&pxor	($rndkey0,$rndkey0);
914	&pxor	($rndkey1,$rndkey1);
915	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
916	&pinsrd	($rndkey0,$rounds_,0);
917	&lea	($key_,&DWP(3,$rounds_));
918	&pinsrd	($rndkey1,$key_,0);
919	&inc	($rounds_);
920	&pinsrd	($rndkey0,$rounds_,1);
921	&inc	($key_);
922	&pinsrd	($rndkey1,$key_,1);
923	&inc	($rounds_);
924	&pinsrd	($rndkey0,$rounds_,2);
925	&inc	($key_);
926	&pinsrd	($rndkey1,$key_,2);
927	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
928	&pshufb	($rndkey0,$inout0);		# byte swap
929	&movdqu	($inout4,&QWP(0,$key));		# key[0]
930	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
931	&pshufb	($rndkey1,$inout0);		# byte swap
932
933	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
934	&pshufd	($inout1,$rndkey0,2<<6);
935	&cmp	($len,6);
936	&jb	(&label("ctr32_tail"));
937	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
938	&shl	($rounds,4);
939	&mov	($rounds_,16);
940	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
941	&mov	($key_,$key);			# backup $key
942	&sub	($rounds_,$rounds);		# backup twisted $rounds
943	&lea	($key,&DWP(32,$key,$rounds));
944	&sub	($len,6);
945	&jmp	(&label("ctr32_loop6"));
946
947&set_label("ctr32_loop6",16);
948	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
949	&pshufd	($inout2,$rndkey0,1<<6);
950	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
951	&pshufd	($inout3,$rndkey1,3<<6);
952	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
953	&pshufd	($inout4,$rndkey1,2<<6);
954	&pxor		($inout1,$rndkey0);
955	&pshufd	($inout5,$rndkey1,1<<6);
956	&$movekey	($rndkey1,&QWP(16,$key_));
957	&pxor		($inout2,$rndkey0);
958	&pxor		($inout3,$rndkey0);
959	&aesenc		($inout0,$rndkey1);
960	&pxor		($inout4,$rndkey0);
961	&pxor		($inout5,$rndkey0);
962	&aesenc		($inout1,$rndkey1);
963	&$movekey	($rndkey0,&QWP(32,$key_));
964	&mov		($rounds,$rounds_);
965	&aesenc		($inout2,$rndkey1);
966	&aesenc		($inout3,$rndkey1);
967	&aesenc		($inout4,$rndkey1);
968	&aesenc		($inout5,$rndkey1);
969
970	&call		(&label("_aesni_encrypt6_enter"));
971
972	&movups	($rndkey1,&QWP(0,$inp));
973	&movups	($rndkey0,&QWP(0x10,$inp));
974	&xorps	($inout0,$rndkey1);
975	&movups	($rndkey1,&QWP(0x20,$inp));
976	&xorps	($inout1,$rndkey0);
977	&movups	(&QWP(0,$out),$inout0);
978	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
979	&xorps	($inout2,$rndkey1);
980	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
981	&movups	(&QWP(0x10,$out),$inout1);
982	&movups	(&QWP(0x20,$out),$inout2);
983
984	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
985	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
986	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
987
988	&movups	($inout1,&QWP(0x30,$inp));
989	&movups	($inout2,&QWP(0x40,$inp));
990	&xorps	($inout3,$inout1);
991	&movups	($inout1,&QWP(0x50,$inp));
992	&lea	($inp,&DWP(0x60,$inp));
993	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
994	&pshufb	($rndkey0,$inout0);		# byte swap
995	&xorps	($inout4,$inout2);
996	&movups	(&QWP(0x30,$out),$inout3);
997	&xorps	($inout5,$inout1);
998	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
999	&pshufb	($rndkey1,$inout0);		# byte swap
1000	&movups	(&QWP(0x40,$out),$inout4);
1001	&pshufd	($inout0,$rndkey0,3<<6);
1002	&movups	(&QWP(0x50,$out),$inout5);
1003	&lea	($out,&DWP(0x60,$out));
1004
1005	&pshufd	($inout1,$rndkey0,2<<6);
1006	&sub	($len,6);
1007	&jnc	(&label("ctr32_loop6"));
1008
1009	&add	($len,6);
1010	&jz	(&label("ctr32_ret"));
1011	&movdqu	($inout5,&QWP(0,$key_));
1012	&mov	($key,$key_);
1013	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
1014	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1015
1016&set_label("ctr32_tail");
1017	&por	($inout0,$inout5);
1018	&cmp	($len,2);
1019	&jb	(&label("ctr32_one"));
1020
1021	&pshufd	($inout2,$rndkey0,1<<6);
1022	&por	($inout1,$inout5);
1023	&je	(&label("ctr32_two"));
1024
1025	&pshufd	($inout3,$rndkey1,3<<6);
1026	&por	($inout2,$inout5);
1027	&cmp	($len,4);
1028	&jb	(&label("ctr32_three"));
1029
1030	&pshufd	($inout4,$rndkey1,2<<6);
1031	&por	($inout3,$inout5);
1032	&je	(&label("ctr32_four"));
1033
1034	&por	($inout4,$inout5);
1035	&call	("_aesni_encrypt6");
1036	&movups	($rndkey1,&QWP(0,$inp));
1037	&movups	($rndkey0,&QWP(0x10,$inp));
1038	&xorps	($inout0,$rndkey1);
1039	&movups	($rndkey1,&QWP(0x20,$inp));
1040	&xorps	($inout1,$rndkey0);
1041	&movups	($rndkey0,&QWP(0x30,$inp));
1042	&xorps	($inout2,$rndkey1);
1043	&movups	($rndkey1,&QWP(0x40,$inp));
1044	&xorps	($inout3,$rndkey0);
1045	&movups	(&QWP(0,$out),$inout0);
1046	&xorps	($inout4,$rndkey1);
1047	&movups	(&QWP(0x10,$out),$inout1);
1048	&movups	(&QWP(0x20,$out),$inout2);
1049	&movups	(&QWP(0x30,$out),$inout3);
1050	&movups	(&QWP(0x40,$out),$inout4);
1051	&jmp	(&label("ctr32_ret"));
1052
1053&set_label("ctr32_one_shortcut",16);
1054	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
1055	&mov	($rounds,&DWP(240,$key));
1056
1057&set_label("ctr32_one");
1058	if ($inline)
1059	{   &aesni_inline_generate1("enc");	}
1060	else
1061	{   &call	("_aesni_encrypt1");	}
1062	&movups	($in0,&QWP(0,$inp));
1063	&xorps	($in0,$inout0);
1064	&movups	(&QWP(0,$out),$in0);
1065	&jmp	(&label("ctr32_ret"));
1066
1067&set_label("ctr32_two",16);
1068	&call	("_aesni_encrypt2");
1069	&movups	($inout3,&QWP(0,$inp));
1070	&movups	($inout4,&QWP(0x10,$inp));
1071	&xorps	($inout0,$inout3);
1072	&xorps	($inout1,$inout4);
1073	&movups	(&QWP(0,$out),$inout0);
1074	&movups	(&QWP(0x10,$out),$inout1);
1075	&jmp	(&label("ctr32_ret"));
1076
1077&set_label("ctr32_three",16);
1078	&call	("_aesni_encrypt3");
1079	&movups	($inout3,&QWP(0,$inp));
1080	&movups	($inout4,&QWP(0x10,$inp));
1081	&xorps	($inout0,$inout3);
1082	&movups	($inout5,&QWP(0x20,$inp));
1083	&xorps	($inout1,$inout4);
1084	&movups	(&QWP(0,$out),$inout0);
1085	&xorps	($inout2,$inout5);
1086	&movups	(&QWP(0x10,$out),$inout1);
1087	&movups	(&QWP(0x20,$out),$inout2);
1088	&jmp	(&label("ctr32_ret"));
1089
1090&set_label("ctr32_four",16);
1091	&call	("_aesni_encrypt4");
1092	&movups	($inout4,&QWP(0,$inp));
1093	&movups	($inout5,&QWP(0x10,$inp));
1094	&movups	($rndkey1,&QWP(0x20,$inp));
1095	&xorps	($inout0,$inout4);
1096	&movups	($rndkey0,&QWP(0x30,$inp));
1097	&xorps	($inout1,$inout5);
1098	&movups	(&QWP(0,$out),$inout0);
1099	&xorps	($inout2,$rndkey1);
1100	&movups	(&QWP(0x10,$out),$inout1);
1101	&xorps	($inout3,$rndkey0);
1102	&movups	(&QWP(0x20,$out),$inout2);
1103	&movups	(&QWP(0x30,$out),$inout3);
1104
1105&set_label("ctr32_ret");
1106	&pxor	("xmm0","xmm0");		# clear register bank
1107	&pxor	("xmm1","xmm1");
1108	&pxor	("xmm2","xmm2");
1109	&pxor	("xmm3","xmm3");
1110	&pxor	("xmm4","xmm4");
1111	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
1112	&pxor	("xmm5","xmm5");
1113	&movdqa	(&QWP(48,"esp"),"xmm0");
1114	&pxor	("xmm6","xmm6");
1115	&movdqa	(&QWP(64,"esp"),"xmm0");
1116	&pxor	("xmm7","xmm7");
1117	&mov	("esp",&DWP(80,"esp"));
1118&function_end("aesni_ctr32_encrypt_blocks");
1119
1120######################################################################
1121# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1122#	const AES_KEY *key1, const AES_KEY *key2,
1123#	const unsigned char iv[16]);
1124#
1125{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1126
# aesni_xts_encrypt: emits the XTS encryption routine.
#
# Arguments are fetched with wparam(0..5): input pointer, output pointer,
# byte length, key1 (used for the data blocks), key2 (used only to encrypt
# the tweak) and the 16-byte clear-text tweak.
#
# Outline of the generated code:
#   1. encrypt the clear-text tweak with key2 to obtain the initial tweak;
#   2. carve a 16-byte aligned scratch frame off the stack:
#        16*0 .. 16*5   per-block tweaks of the current batch
#        16*6           constant {0x87,0,1,0} used for tweak doubling
#        16*7+0         saved $len (later $len%16)
#        16*7+4         caller's stack pointer
#   3. process six blocks per iteration in xts_enc_loop6;
#   4. process 1..5 remaining whole blocks (xts_enc_one/two/three/four or
#      the five-block fall-through path);
#   5. if $len is not a multiple of 16, do ciphertext stealing
#      (xts_enc_steal) and re-encrypt the last full block;
#   6. wipe xmm0-xmm7 and the tweak slots, then restore the stack pointer.
#
# Tweak doubling idiom used throughout: pcmpgtd of a zeroed register
# against the tweak turns each dword's sign bit into an all-ones mask;
# pshufd 0x13 routes the masks of dwords 3 and 1 to dwords 0 and 2; pand
# with {0x87,0,1,0} leaves the reduction constant and the carry between
# the two qwords; paddq shifts both qwords left by one bit and pxor folds
# carry and residue back in.
#
# Register aliases are set up by the enclosing block:
# ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1), so
# loading round keys or input clobbers the tweak temporaries.
1127&function_begin("aesni_xts_encrypt");
1128	&mov	($key,&wparam(4));		# key2
1129	&mov	($inp,&wparam(5));		# clear-text tweak
1130
1131	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1132	&movups	($inout0,&QWP(0,$inp));
	# single-block encryption of the tweak, inlined or called depending on
	# $inline; the result is picked up from $inout0 below
1133	if ($inline)
1134	{   &aesni_inline_generate1("enc");	}
1135	else
1136	{   &call	("_aesni_encrypt1");	}
1137
1138	&mov	($inp,&wparam(0));
1139	&mov	($out,&wparam(1));
1140	&mov	($len,&wparam(2));
1141	&mov	($key,&wparam(3));		# key1
1142
1143	&mov	($key_,"esp");			# remember caller's stack pointer
1144	&sub	("esp",16*7+8);
1145	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1146	&and	("esp",-16);			# align stack
1147
1148	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1149	&mov	(&DWP(16*6+4,"esp"),0);
1150	&mov	(&DWP(16*6+8,"esp"),1);
1151	&mov	(&DWP(16*6+12,"esp"),0);
1152	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1153	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1154
1155	&movdqa	($tweak,$inout0);		# initial tweak
1156	&pxor	($twtmp,$twtmp);
1157	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1158	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1159
1160	&and	($len,-16);			# whole blocks only
1161	&mov	($key_,$key);			# backup $key
1162	&mov	($rounds_,$rounds);		# backup $rounds
1163	&sub	($len,16*6);
1164	&jc	(&label("xts_enc_short"));	# fewer than 6 blocks
1165
	# $rounds_ = 16-16*rounds and $key = key1+32+16*rounds; presumably the
	# form _aesni_encrypt6_enter expects (defined outside this block)
1166	&shl	($rounds,4);
1167	&mov	($rounds_,16);
1168	&sub	($rounds_,$rounds);
1169	&lea	($key,&DWP(32,$key,$rounds));
1170	&jmp	(&label("xts_enc_loop6"));
1171
1172&set_label("xts_enc_loop6",16);
	# generation-time loop: emits four copies of "store tweak, double it",
	# filling scratch slots 16*0..16*3
1173	for ($i=0;$i<4;$i++) {
1174	    &pshufd	($twres,$twtmp,0x13);
1175	    &pxor	($twtmp,$twtmp);
1176	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1177	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1178	    &pand	($twres,$twmask);	# isolate carry and residue
1179	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1180	    &pxor	($tweak,$twres);
1181	}
	# fifth tweak goes to slot 16*4 ($i becomes 5 afterwards); the sixth is
	# built in $inout5 and saved to slot 16*5 further down
1182	&pshufd	($inout5,$twtmp,0x13);
1183	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1184	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1185	 &$movekey	($rndkey0,&QWP(0,$key_));
1186	&pand	($inout5,$twmask);		# isolate carry and residue
1187	 &movups	($inout0,&QWP(0,$inp));	# load input
1188	&pxor	($inout5,$tweak);
1189
1190	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1191	&mov	($rounds,$rounds_);		# restore $rounds
1192	&movdqu	($inout1,&QWP(16*1,$inp));
1193	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1194	&movdqu	($inout2,&QWP(16*2,$inp));
1195	 &pxor		($inout1,$rndkey0);
1196	&movdqu	($inout3,&QWP(16*3,$inp));
1197	 &pxor		($inout2,$rndkey0);
1198	&movdqu	($inout4,&QWP(16*4,$inp));
1199	 &pxor		($inout3,$rndkey0);
1200	&movdqu	($rndkey1,&QWP(16*5,$inp));	# 6th input block ($inout5 holds its tweak)
1201	 &pxor		($inout4,$rndkey0);
1202	&lea	($inp,&DWP(16*6,$inp));
1203	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1204	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1205	&pxor	($inout5,$rndkey1);
1206
	# first AES round is issued here, interleaved with the remaining
	# tweak xors, before entering the shared six-block body
1207	 &$movekey	($rndkey1,&QWP(16,$key_));
1208	&pxor	($inout1,&QWP(16*1,"esp"));
1209	&pxor	($inout2,&QWP(16*2,"esp"));
1210	 &aesenc	($inout0,$rndkey1);
1211	&pxor	($inout3,&QWP(16*3,"esp"));
1212	&pxor	($inout4,&QWP(16*4,"esp"));
1213	 &aesenc	($inout1,$rndkey1);
1214	&pxor		($inout5,$rndkey0);
1215	 &$movekey	($rndkey0,&QWP(32,$key_));
1216	 &aesenc	($inout2,$rndkey1);
1217	 &aesenc	($inout3,$rndkey1);
1218	 &aesenc	($inout4,$rndkey1);
1219	 &aesenc	($inout5,$rndkey1);
1220	&call		(&label("_aesni_encrypt6_enter"));
1221
	# xor results with the saved tweaks and store them, while starting
	# the doubling of the last tweak for the next batch
1222	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1223       &pxor	($twtmp,$twtmp);
1224	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1225       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1226	&xorps	($inout1,&QWP(16*1,"esp"));
1227	&movups	(&QWP(16*0,$out),$inout0);	# write output
1228	&xorps	($inout2,&QWP(16*2,"esp"));
1229	&movups	(&QWP(16*1,$out),$inout1);
1230	&xorps	($inout3,&QWP(16*3,"esp"));
1231	&movups	(&QWP(16*2,$out),$inout2);
1232	&xorps	($inout4,&QWP(16*4,"esp"));
1233	&movups	(&QWP(16*3,$out),$inout3);
1234	&xorps	($inout5,$tweak);
1235	&movups	(&QWP(16*4,$out),$inout4);
1236       &pshufd	($twres,$twtmp,0x13);
1237	&movups	(&QWP(16*5,$out),$inout5);
1238	&lea	($out,&DWP(16*6,$out));
1239       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1240
1241	&pxor	($twtmp,$twtmp);
1242	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1243	&pand	($twres,$twmask);		# isolate carry and residue
1244	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1245	&pxor	($tweak,$twres);
1246
1247	&sub	($len,16*6);
1248	&jnc	(&label("xts_enc_loop6"));	# at least 6 more blocks
1249
1250	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1251	&mov	($key,$key_);			# restore $key
1252	&mov	($rounds_,$rounds);
1253
1254&set_label("xts_enc_short");
1255	&add	($len,16*6);			# undo the bias: bytes left in whole blocks
1256	&jz	(&label("xts_enc_done6x"));
1257
	# 1..5 blocks remain.  The conditional jumps below consume flags set
	# by the preceding cmp; the SSE instructions in between leave EFLAGS
	# untouched.
1258	&movdqa	($inout3,$tweak);		# put aside previous tweak
1259	&cmp	($len,0x20);
1260	&jb	(&label("xts_enc_one"));
1261
1262	&pshufd	($twres,$twtmp,0x13);
1263	&pxor	($twtmp,$twtmp);
1264	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1265	&pand	($twres,$twmask);		# isolate carry and residue
1266	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1267	&pxor	($tweak,$twres);
1268	&je	(&label("xts_enc_two"));	# $len==0x20
1269
1270	&pshufd	($twres,$twtmp,0x13);
1271	&pxor	($twtmp,$twtmp);
1272	&movdqa	($inout4,$tweak);		# put aside previous tweak
1273	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1274	&pand	($twres,$twmask);		# isolate carry and residue
1275	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1276	&pxor	($tweak,$twres);
1277	&cmp	($len,0x40);
1278	&jb	(&label("xts_enc_three"));
1279
1280	&pshufd	($twres,$twtmp,0x13);
1281	&pxor	($twtmp,$twtmp);
1282	&movdqa	($inout5,$tweak);		# put aside previous tweak
1283	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1284	&pand	($twres,$twmask);		# isolate carry and residue
1285	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1286	&pxor	($tweak,$twres);
1287	&movdqa	(&QWP(16*0,"esp"),$inout3);
1288	&movdqa	(&QWP(16*1,"esp"),$inout4);
1289	&je	(&label("xts_enc_four"));	# $len==0x40
1290
	# five blocks: tweaks 0..3 live in slots 16*0..16*3, the fifth is
	# built in $inout5 and saved to slot 16*4
1291	&movdqa	(&QWP(16*2,"esp"),$inout5);
1292	&pshufd	($inout5,$twtmp,0x13);
1293	&movdqa	(&QWP(16*3,"esp"),$tweak);
1294	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1295	&pand	($inout5,$twmask);		# isolate carry and residue
1296	&pxor	($inout5,$tweak);
1297
1298	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1299	&movdqu	($inout1,&QWP(16*1,$inp));
1300	&movdqu	($inout2,&QWP(16*2,$inp));
1301	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1302	&movdqu	($inout3,&QWP(16*3,$inp));
1303	&pxor	($inout1,&QWP(16*1,"esp"));
1304	&movdqu	($inout4,&QWP(16*4,$inp));
1305	&pxor	($inout2,&QWP(16*2,"esp"));
1306	&lea	($inp,&DWP(16*5,$inp));
1307	&pxor	($inout3,&QWP(16*3,"esp"));
1308	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1309	&pxor	($inout4,$inout5);
1310
	# six-block helper is used for five blocks; $inout5 carries no input
	# data here and its result is never stored
1311	&call	("_aesni_encrypt6");
1312
1313	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1314	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1315	&xorps	($inout1,&QWP(16*1,"esp"));
1316	&xorps	($inout2,&QWP(16*2,"esp"));
1317	&movups	(&QWP(16*0,$out),$inout0);	# write output
1318	&xorps	($inout3,&QWP(16*3,"esp"));
1319	&movups	(&QWP(16*1,$out),$inout1);
1320	&xorps	($inout4,$tweak);
1321	&movups	(&QWP(16*2,$out),$inout2);
1322	&movups	(&QWP(16*3,$out),$inout3);
1323	&movups	(&QWP(16*4,$out),$inout4);
1324	&lea	($out,&DWP(16*5,$out));
1325	&jmp	(&label("xts_enc_done"));
1326
1327&set_label("xts_enc_one",16);
	# one block, tweak in $inout3
1328	&movups	($inout0,&QWP(16*0,$inp));	# load input
1329	&lea	($inp,&DWP(16*1,$inp));
1330	&xorps	($inout0,$inout3);		# input^=tweak
1331	if ($inline)
1332	{   &aesni_inline_generate1("enc");	}
1333	else
1334	{   &call	("_aesni_encrypt1");	}
1335	&xorps	($inout0,$inout3);		# output^=tweak
1336	&movups	(&QWP(16*0,$out),$inout0);	# write output
1337	&lea	($out,&DWP(16*1,$out));
1338
1339	&movdqa	($tweak,$inout3);		# last tweak
1340	&jmp	(&label("xts_enc_done"));
1341
1342&set_label("xts_enc_two",16);
	# two blocks, tweaks in $inout3 and $inout4
1343	&movaps	($inout4,$tweak);		# put aside last tweak
1344
1345	&movups	($inout0,&QWP(16*0,$inp));	# load input
1346	&movups	($inout1,&QWP(16*1,$inp));
1347	&lea	($inp,&DWP(16*2,$inp));
1348	&xorps	($inout0,$inout3);		# input^=tweak
1349	&xorps	($inout1,$inout4);
1350
1351	&call	("_aesni_encrypt2");
1352
1353	&xorps	($inout0,$inout3);		# output^=tweak
1354	&xorps	($inout1,$inout4);
1355	&movups	(&QWP(16*0,$out),$inout0);	# write output
1356	&movups	(&QWP(16*1,$out),$inout1);
1357	&lea	($out,&DWP(16*2,$out));
1358
1359	&movdqa	($tweak,$inout4);		# last tweak
1360	&jmp	(&label("xts_enc_done"));
1361
1362&set_label("xts_enc_three",16);
	# three blocks, tweaks in $inout3, $inout4 and $inout5
1363	&movaps	($inout5,$tweak);		# put aside last tweak
1364	&movups	($inout0,&QWP(16*0,$inp));	# load input
1365	&movups	($inout1,&QWP(16*1,$inp));
1366	&movups	($inout2,&QWP(16*2,$inp));
1367	&lea	($inp,&DWP(16*3,$inp));
1368	&xorps	($inout0,$inout3);		# input^=tweak
1369	&xorps	($inout1,$inout4);
1370	&xorps	($inout2,$inout5);
1371
1372	&call	("_aesni_encrypt3");
1373
1374	&xorps	($inout0,$inout3);		# output^=tweak
1375	&xorps	($inout1,$inout4);
1376	&xorps	($inout2,$inout5);
1377	&movups	(&QWP(16*0,$out),$inout0);	# write output
1378	&movups	(&QWP(16*1,$out),$inout1);
1379	&movups	(&QWP(16*2,$out),$inout2);
1380	&lea	($out,&DWP(16*3,$out));
1381
1382	&movdqa	($tweak,$inout5);		# last tweak
1383	&jmp	(&label("xts_enc_done"));
1384
1385&set_label("xts_enc_four",16);
	# four blocks: tweaks 0 and 1 were spilled to slots 16*0 and 16*1,
	# tweak 2 is in $inout5 and tweak 3 goes to $inout4
1386	&movaps	($inout4,$tweak);		# put aside last tweak
1387
1388	&movups	($inout0,&QWP(16*0,$inp));	# load input
1389	&movups	($inout1,&QWP(16*1,$inp));
1390	&movups	($inout2,&QWP(16*2,$inp));
1391	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1392	&movups	($inout3,&QWP(16*3,$inp));
1393	&lea	($inp,&DWP(16*4,$inp));
1394	&xorps	($inout1,&QWP(16*1,"esp"));
1395	&xorps	($inout2,$inout5);
1396	&xorps	($inout3,$inout4);
1397
1398	&call	("_aesni_encrypt4");
1399
1400	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1401	&xorps	($inout1,&QWP(16*1,"esp"));
1402	&xorps	($inout2,$inout5);
1403	&movups	(&QWP(16*0,$out),$inout0);	# write output
1404	&xorps	($inout3,$inout4);
1405	&movups	(&QWP(16*1,$out),$inout1);
1406	&movups	(&QWP(16*2,$out),$inout2);
1407	&movups	(&QWP(16*3,$out),$inout3);
1408	&lea	($out,&DWP(16*4,$out));
1409
1410	&movdqa	($tweak,$inout4);		# last tweak
1411	&jmp	(&label("xts_enc_done"));
1412
1413&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
1414	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1415	&and	($len,15);
1416	&jz	(&label("xts_enc_ret"));	# no partial block
1417	&movdqa	($inout3,$tweak);		# tweak for the stolen block
1418	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1419	&jmp	(&label("xts_enc_steal"));
1420
1421&set_label("xts_enc_done",16);
1422	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1423	&pxor	($twtmp,$twtmp);
1424	&and	($len,15);
1425	&jz	(&label("xts_enc_ret"));	# no partial block
1426
	# $tweak still holds the tweak of the last block written; double it
	# once more into $inout3 for the stolen block
1427	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1428	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1429	&pshufd	($inout3,$twtmp,0x13);
1430	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1431	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
1432	&pxor	($inout3,$tweak);
1433
1434&set_label("xts_enc_steal");
	# byte-wise ciphertext stealing, $len = $len%16 iterations: each
	# trailing input byte replaces a byte of the last full output block,
	# and the displaced byte becomes the partial output
1435	&movz	($rounds,&BP(0,$inp));
1436	&movz	($key,&BP(-16,$out));
1437	&lea	($inp,&DWP(1,$inp));
1438	&mov	(&BP(-16,$out),&LB($rounds));
1439	&mov	(&BP(0,$out),&LB($key));
1440	&lea	($out,&DWP(1,$out));
1441	&sub	($len,1);
1442	&jnz	(&label("xts_enc_steal"));
1443
1444	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1445	&mov	($key,$key_);			# restore $key
1446	&mov	($rounds,$rounds_);		# restore $rounds
1447
	# re-encrypt the patched last full block in place
1448	&movups	($inout0,&QWP(-16,$out));	# load input
1449	&xorps	($inout0,$inout3);		# input^=tweak
1450	if ($inline)
1451	{   &aesni_inline_generate1("enc");	}
1452	else
1453	{   &call	("_aesni_encrypt1");	}
1454	&xorps	($inout0,$inout3);		# output^=tweak
1455	&movups	(&QWP(-16,$out),$inout0);	# write output
1456
1457&set_label("xts_enc_ret");
	# slots 16*0..16*5 held tweaks and are zeroed; the constant at 16*6
	# and the saved values at 16*7 are left as they are
1458	&pxor	("xmm0","xmm0");		# clear register bank
1459	&pxor	("xmm1","xmm1");
1460	&pxor	("xmm2","xmm2");
1461	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1462	&pxor	("xmm3","xmm3");
1463	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1464	&pxor	("xmm4","xmm4");
1465	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1466	&pxor	("xmm5","xmm5");
1467	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1468	&pxor	("xmm6","xmm6");
1469	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1470	&pxor	("xmm7","xmm7");
1471	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1472	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1473&function_end("aesni_xts_encrypt");
1474
# aesni_xts_decrypt: emits the XTS decryption routine.
#
# Arguments are fetched with wparam(0..5): input pointer, output pointer,
# byte length, key1 (used for the data blocks), key2 (used only to encrypt
# the tweak) and the 16-byte clear-text tweak.
#
# Outline of the generated code:
#   1. encrypt (not decrypt) the clear-text tweak with key2;
#   2. carve a 16-byte aligned scratch frame off the stack:
#        16*0 .. 16*5   per-block tweaks of the current batch
#        16*6           constant {0x87,0,1,0} used for tweak doubling
#        16*7+0         saved $len (later $len%16)
#        16*7+4         caller's stack pointer
#   3. if $len is not a multiple of 16, hold back one whole block so the
#      bulk paths stop one block early;
#   4. process six blocks per iteration in xts_dec_loop6, then 1..5
#      remaining whole blocks (xts_dec_one/two/three/four or the
#      five-block fall-through path);
#   5. for a partial tail: decrypt the held-back block with the *next*
#      tweak, swap bytes with the tail (xts_dec_steal), then decrypt the
#      reassembled block with the *previous* tweak;
#   6. wipe xmm0-xmm7 and the tweak slots, then restore the stack pointer.
#
# Tweak doubling idiom used throughout: pcmpgtd of a zeroed register
# against the tweak turns each dword's sign bit into an all-ones mask;
# pshufd 0x13 routes the masks of dwords 3 and 1 to dwords 0 and 2; pand
# with {0x87,0,1,0} leaves the reduction constant and the carry between
# the two qwords; paddq shifts both qwords left by one bit and pxor folds
# carry and residue back in.
#
# Register aliases are set up by the enclosing block:
# ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1), so
# loading round keys or input clobbers the tweak temporaries.
1475&function_begin("aesni_xts_decrypt");
1476	&mov	($key,&wparam(4));		# key2
1477	&mov	($inp,&wparam(5));		# clear-text tweak
1478
1479	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1480	&movups	($inout0,&QWP(0,$inp));
	# the tweak is produced with the "enc" primitive here as well; the
	# result is picked up from $inout0 below
1481	if ($inline)
1482	{   &aesni_inline_generate1("enc");	}
1483	else
1484	{   &call	("_aesni_encrypt1");	}
1485
1486	&mov	($inp,&wparam(0));
1487	&mov	($out,&wparam(1));
1488	&mov	($len,&wparam(2));
1489	&mov	($key,&wparam(3));		# key1
1490
1491	&mov	($key_,"esp");			# remember caller's stack pointer
1492	&sub	("esp",16*7+8);
1493	&and	("esp",-16);			# align stack
1494
	# $rounds_ = ($len&15) ? 16 : 0, computed without a branch
1495	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
1496	&test	($len,15);
1497	&setnz	(&LB($rounds_));
1498	&shl	($rounds_,4);
1499	&sub	($len,$rounds_);
1500
1501	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1502	&mov	(&DWP(16*6+4,"esp"),0);
1503	&mov	(&DWP(16*6+8,"esp"),1);
1504	&mov	(&DWP(16*6+12,"esp"),0);
1505	&mov	(&DWP(16*7+0,"esp"),$len);	# save adjusted $len ($len%16 is unchanged)
1506	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1507
1508	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1509	&mov	($key_,$key);			# backup $key
1510	&mov	($rounds_,$rounds);		# backup $rounds
1511
1512	&movdqa	($tweak,$inout0);		# initial tweak
1513	&pxor	($twtmp,$twtmp);
1514	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1515	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1516
1517	&and	($len,-16);			# whole blocks only
1518	&sub	($len,16*6);
1519	&jc	(&label("xts_dec_short"));	# fewer than 6 blocks
1520
	# $rounds_ = 16-16*rounds and $key = key1+32+16*rounds; presumably the
	# form _aesni_decrypt6_enter expects (defined outside this block)
1521	&shl	($rounds,4);
1522	&mov	($rounds_,16);
1523	&sub	($rounds_,$rounds);
1524	&lea	($key,&DWP(32,$key,$rounds));
1525	&jmp	(&label("xts_dec_loop6"));
1526
1527&set_label("xts_dec_loop6",16);
	# generation-time loop: emits four copies of "store tweak, double it",
	# filling scratch slots 16*0..16*3
1528	for ($i=0;$i<4;$i++) {
1529	    &pshufd	($twres,$twtmp,0x13);
1530	    &pxor	($twtmp,$twtmp);
1531	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1532	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1533	    &pand	($twres,$twmask);	# isolate carry and residue
1534	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1535	    &pxor	($tweak,$twres);
1536	}
	# fifth tweak goes to slot 16*4 ($i becomes 5 afterwards); the sixth is
	# built in $inout5 and saved to slot 16*5 further down
1537	&pshufd	($inout5,$twtmp,0x13);
1538	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1539	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1540	 &$movekey	($rndkey0,&QWP(0,$key_));
1541	&pand	($inout5,$twmask);		# isolate carry and residue
1542	 &movups	($inout0,&QWP(0,$inp));	# load input
1543	&pxor	($inout5,$tweak);
1544
1545	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1546	&mov	($rounds,$rounds_);
1547	&movdqu	($inout1,&QWP(16*1,$inp));
1548	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1549	&movdqu	($inout2,&QWP(16*2,$inp));
1550	 &pxor		($inout1,$rndkey0);
1551	&movdqu	($inout3,&QWP(16*3,$inp));
1552	 &pxor		($inout2,$rndkey0);
1553	&movdqu	($inout4,&QWP(16*4,$inp));
1554	 &pxor		($inout3,$rndkey0);
1555	&movdqu	($rndkey1,&QWP(16*5,$inp));	# 6th input block ($inout5 holds its tweak)
1556	 &pxor		($inout4,$rndkey0);
1557	&lea	($inp,&DWP(16*6,$inp));
1558	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1559	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1560	&pxor	($inout5,$rndkey1);
1561
	# first AES round is issued here, interleaved with the remaining
	# tweak xors, before entering the shared six-block body
1562	 &$movekey	($rndkey1,&QWP(16,$key_));
1563	&pxor	($inout1,&QWP(16*1,"esp"));
1564	&pxor	($inout2,&QWP(16*2,"esp"));
1565	 &aesdec	($inout0,$rndkey1);
1566	&pxor	($inout3,&QWP(16*3,"esp"));
1567	&pxor	($inout4,&QWP(16*4,"esp"));
1568	 &aesdec	($inout1,$rndkey1);
1569	&pxor		($inout5,$rndkey0);
1570	 &$movekey	($rndkey0,&QWP(32,$key_));
1571	 &aesdec	($inout2,$rndkey1);
1572	 &aesdec	($inout3,$rndkey1);
1573	 &aesdec	($inout4,$rndkey1);
1574	 &aesdec	($inout5,$rndkey1);
1575	&call		(&label("_aesni_decrypt6_enter"));
1576
	# xor results with the saved tweaks and store them, while starting
	# the doubling of the last tweak for the next batch
1577	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1578       &pxor	($twtmp,$twtmp);
1579	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1580       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1581	&xorps	($inout1,&QWP(16*1,"esp"));
1582	&movups	(&QWP(16*0,$out),$inout0);	# write output
1583	&xorps	($inout2,&QWP(16*2,"esp"));
1584	&movups	(&QWP(16*1,$out),$inout1);
1585	&xorps	($inout3,&QWP(16*3,"esp"));
1586	&movups	(&QWP(16*2,$out),$inout2);
1587	&xorps	($inout4,&QWP(16*4,"esp"));
1588	&movups	(&QWP(16*3,$out),$inout3);
1589	&xorps	($inout5,$tweak);
1590	&movups	(&QWP(16*4,$out),$inout4);
1591       &pshufd	($twres,$twtmp,0x13);
1592	&movups	(&QWP(16*5,$out),$inout5);
1593	&lea	($out,&DWP(16*6,$out));
1594       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1595
1596	&pxor	($twtmp,$twtmp);
1597	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1598	&pand	($twres,$twmask);		# isolate carry and residue
1599	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1600	&pxor	($tweak,$twres);
1601
1602	&sub	($len,16*6);
1603	&jnc	(&label("xts_dec_loop6"));	# at least 6 more blocks
1604
1605	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1606	&mov	($key,$key_);			# restore $key
1607	&mov	($rounds_,$rounds);
1608
1609&set_label("xts_dec_short");
1610	&add	($len,16*6);			# undo the bias: bytes left in whole blocks
1611	&jz	(&label("xts_dec_done6x"));
1612
	# 1..5 blocks remain.  The conditional jumps below consume flags set
	# by the preceding cmp; the SSE instructions in between leave EFLAGS
	# untouched.
1613	&movdqa	($inout3,$tweak);		# put aside previous tweak
1614	&cmp	($len,0x20);
1615	&jb	(&label("xts_dec_one"));
1616
1617	&pshufd	($twres,$twtmp,0x13);
1618	&pxor	($twtmp,$twtmp);
1619	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1620	&pand	($twres,$twmask);		# isolate carry and residue
1621	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1622	&pxor	($tweak,$twres);
1623	&je	(&label("xts_dec_two"));	# $len==0x20
1624
1625	&pshufd	($twres,$twtmp,0x13);
1626	&pxor	($twtmp,$twtmp);
1627	&movdqa	($inout4,$tweak);		# put aside previous tweak
1628	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1629	&pand	($twres,$twmask);		# isolate carry and residue
1630	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1631	&pxor	($tweak,$twres);
1632	&cmp	($len,0x40);
1633	&jb	(&label("xts_dec_three"));
1634
1635	&pshufd	($twres,$twtmp,0x13);
1636	&pxor	($twtmp,$twtmp);
1637	&movdqa	($inout5,$tweak);		# put aside previous tweak
1638	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1639	&pand	($twres,$twmask);		# isolate carry and residue
1640	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1641	&pxor	($tweak,$twres);
1642	&movdqa	(&QWP(16*0,"esp"),$inout3);
1643	&movdqa	(&QWP(16*1,"esp"),$inout4);
1644	&je	(&label("xts_dec_four"));	# $len==0x40
1645
	# five blocks: tweaks 0..3 live in slots 16*0..16*3, the fifth is
	# built in $inout5 and saved to slot 16*4
1646	&movdqa	(&QWP(16*2,"esp"),$inout5);
1647	&pshufd	($inout5,$twtmp,0x13);
1648	&movdqa	(&QWP(16*3,"esp"),$tweak);
1649	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1650	&pand	($inout5,$twmask);		# isolate carry and residue
1651	&pxor	($inout5,$tweak);
1652
1653	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1654	&movdqu	($inout1,&QWP(16*1,$inp));
1655	&movdqu	($inout2,&QWP(16*2,$inp));
1656	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1657	&movdqu	($inout3,&QWP(16*3,$inp));
1658	&pxor	($inout1,&QWP(16*1,"esp"));
1659	&movdqu	($inout4,&QWP(16*4,$inp));
1660	&pxor	($inout2,&QWP(16*2,"esp"));
1661	&lea	($inp,&DWP(16*5,$inp));
1662	&pxor	($inout3,&QWP(16*3,"esp"));
1663	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1664	&pxor	($inout4,$inout5);
1665
	# six-block helper is used for five blocks; $inout5 carries no input
	# data here and its result is never stored
1666	&call	("_aesni_decrypt6");
1667
1668	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1669	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1670	&xorps	($inout1,&QWP(16*1,"esp"));
1671	&xorps	($inout2,&QWP(16*2,"esp"));
1672	&movups	(&QWP(16*0,$out),$inout0);	# write output
1673	&xorps	($inout3,&QWP(16*3,"esp"));
1674	&movups	(&QWP(16*1,$out),$inout1);
1675	&xorps	($inout4,$tweak);
1676	&movups	(&QWP(16*2,$out),$inout2);
1677	&movups	(&QWP(16*3,$out),$inout3);
1678	&movups	(&QWP(16*4,$out),$inout4);
1679	&lea	($out,&DWP(16*5,$out));
1680	&jmp	(&label("xts_dec_done"));
1681
1682&set_label("xts_dec_one",16);
	# one block, tweak in $inout3
1683	&movups	($inout0,&QWP(16*0,$inp));	# load input
1684	&lea	($inp,&DWP(16*1,$inp));
1685	&xorps	($inout0,$inout3);		# input^=tweak
1686	if ($inline)
1687	{   &aesni_inline_generate1("dec");	}
1688	else
1689	{   &call	("_aesni_decrypt1");	}
1690	&xorps	($inout0,$inout3);		# output^=tweak
1691	&movups	(&QWP(16*0,$out),$inout0);	# write output
1692	&lea	($out,&DWP(16*1,$out));
1693
1694	&movdqa	($tweak,$inout3);		# last tweak
1695	&jmp	(&label("xts_dec_done"));
1696
1697&set_label("xts_dec_two",16);
	# two blocks, tweaks in $inout3 and $inout4
1698	&movaps	($inout4,$tweak);		# put aside last tweak
1699
1700	&movups	($inout0,&QWP(16*0,$inp));	# load input
1701	&movups	($inout1,&QWP(16*1,$inp));
1702	&lea	($inp,&DWP(16*2,$inp));
1703	&xorps	($inout0,$inout3);		# input^=tweak
1704	&xorps	($inout1,$inout4);
1705
1706	&call	("_aesni_decrypt2");
1707
1708	&xorps	($inout0,$inout3);		# output^=tweak
1709	&xorps	($inout1,$inout4);
1710	&movups	(&QWP(16*0,$out),$inout0);	# write output
1711	&movups	(&QWP(16*1,$out),$inout1);
1712	&lea	($out,&DWP(16*2,$out));
1713
1714	&movdqa	($tweak,$inout4);		# last tweak
1715	&jmp	(&label("xts_dec_done"));
1716
1717&set_label("xts_dec_three",16);
	# three blocks, tweaks in $inout3, $inout4 and $inout5
1718	&movaps	($inout5,$tweak);		# put aside last tweak
1719	&movups	($inout0,&QWP(16*0,$inp));	# load input
1720	&movups	($inout1,&QWP(16*1,$inp));
1721	&movups	($inout2,&QWP(16*2,$inp));
1722	&lea	($inp,&DWP(16*3,$inp));
1723	&xorps	($inout0,$inout3);		# input^=tweak
1724	&xorps	($inout1,$inout4);
1725	&xorps	($inout2,$inout5);
1726
1727	&call	("_aesni_decrypt3");
1728
1729	&xorps	($inout0,$inout3);		# output^=tweak
1730	&xorps	($inout1,$inout4);
1731	&xorps	($inout2,$inout5);
1732	&movups	(&QWP(16*0,$out),$inout0);	# write output
1733	&movups	(&QWP(16*1,$out),$inout1);
1734	&movups	(&QWP(16*2,$out),$inout2);
1735	&lea	($out,&DWP(16*3,$out));
1736
1737	&movdqa	($tweak,$inout5);		# last tweak
1738	&jmp	(&label("xts_dec_done"));
1739
1740&set_label("xts_dec_four",16);
	# four blocks: tweaks 0 and 1 were spilled to slots 16*0 and 16*1,
	# tweak 2 is in $inout5 and tweak 3 goes to $inout4
1741	&movaps	($inout4,$tweak);		# put aside last tweak
1742
1743	&movups	($inout0,&QWP(16*0,$inp));	# load input
1744	&movups	($inout1,&QWP(16*1,$inp));
1745	&movups	($inout2,&QWP(16*2,$inp));
1746	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1747	&movups	($inout3,&QWP(16*3,$inp));
1748	&lea	($inp,&DWP(16*4,$inp));
1749	&xorps	($inout1,&QWP(16*1,"esp"));
1750	&xorps	($inout2,$inout5);
1751	&xorps	($inout3,$inout4);
1752
1753	&call	("_aesni_decrypt4");
1754
1755	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1756	&xorps	($inout1,&QWP(16*1,"esp"));
1757	&xorps	($inout2,$inout5);
1758	&movups	(&QWP(16*0,$out),$inout0);	# write output
1759	&xorps	($inout3,$inout4);
1760	&movups	(&QWP(16*1,$out),$inout1);
1761	&movups	(&QWP(16*2,$out),$inout2);
1762	&movups	(&QWP(16*3,$out),$inout3);
1763	&lea	($out,&DWP(16*4,$out));
1764
1765	&movdqa	($tweak,$inout4);		# last tweak
1766	&jmp	(&label("xts_dec_done"));
1767
1768&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
1769	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1770	&and	($len,15);
1771	&jz	(&label("xts_dec_ret"));	# no partial block
1772	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1773	&jmp	(&label("xts_dec_only_one_more"));
1774
1775&set_label("xts_dec_done",16);
1776	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1777	&pxor	($twtmp,$twtmp);
1778	&and	($len,15);
1779	&jz	(&label("xts_dec_ret"));	# no partial block
1780
	# $tweak still holds the tweak of the last block written; advance it
	# to the tweak of the held-back block and refresh $twtmp and $twmask
1781	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1782	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1783	&pshufd	($twres,$twtmp,0x13);
1784	&pxor	($twtmp,$twtmp);
1785	&movdqa	($twmask,&QWP(16*6,"esp"));
1786	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1787	&pand	($twres,$twmask);		# isolate carry and residue
1788	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1789	&pxor	($tweak,$twres);
1790
1791&set_label("xts_dec_only_one_more");
	# $inout4 keeps the current tweak for the final (reassembled) block,
	# $inout3 receives the following tweak, which is applied first
1792	&pshufd	($inout3,$twtmp,0x13);
1793	&movdqa	($inout4,$tweak);		# put aside previous tweak
1794	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1795	&pand	($inout3,$twmask);		# isolate carry and residue
1796	&pxor	($inout3,$tweak);
1797
1798	&mov	($key,$key_);			# restore $key
1799	&mov	($rounds,$rounds_);		# restore $rounds
1800
	# decrypt the held-back full block with the later tweak
1801	&movups	($inout0,&QWP(0,$inp));		# load input
1802	&xorps	($inout0,$inout3);		# input^=tweak
1803	if ($inline)
1804	{   &aesni_inline_generate1("dec");	}
1805	else
1806	{   &call	("_aesni_decrypt1");	}
1807	&xorps	($inout0,$inout3);		# output^=tweak
1808	&movups	(&QWP(0,$out),$inout0);		# write output
1809
1810&set_label("xts_dec_steal");
	# byte-wise stealing, $len = $len%16 iterations: each trailing input
	# byte (at $inp+16) replaces a byte of the block just written, and
	# the displaced byte becomes the partial output (at $out+16)
1811	&movz	($rounds,&BP(16,$inp));
1812	&movz	($key,&BP(0,$out));
1813	&lea	($inp,&DWP(1,$inp));
1814	&mov	(&BP(0,$out),&LB($rounds));
1815	&mov	(&BP(16,$out),&LB($key));
1816	&lea	($out,&DWP(1,$out));
1817	&sub	($len,1);
1818	&jnz	(&label("xts_dec_steal"));
1819
1820	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1821	&mov	($key,$key_);			# restore $key
1822	&mov	($rounds,$rounds_);		# restore $rounds
1823
	# decrypt the reassembled block in place with the earlier tweak
1824	&movups	($inout0,&QWP(0,$out));		# load input
1825	&xorps	($inout0,$inout4);		# input^=tweak
1826	if ($inline)
1827	{   &aesni_inline_generate1("dec");	}
1828	else
1829	{   &call	("_aesni_decrypt1");	}
1830	&xorps	($inout0,$inout4);		# output^=tweak
1831	&movups	(&QWP(0,$out),$inout0);		# write output
1832
1833&set_label("xts_dec_ret");
	# slots 16*0..16*5 held tweaks and are zeroed; the constant at 16*6
	# and the saved values at 16*7 are left as they are
1834	&pxor	("xmm0","xmm0");		# clear register bank
1835	&pxor	("xmm1","xmm1");
1836	&pxor	("xmm2","xmm2");
1837	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1838	&pxor	("xmm3","xmm3");
1839	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1840	&pxor	("xmm4","xmm4");
1841	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1842	&pxor	("xmm5","xmm5");
1843	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1844	&pxor	("xmm6","xmm6");
1845	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1846	&pxor	("xmm7","xmm7");
1847	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1848	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1849&function_end("aesni_xts_decrypt");
1850}
1851
1852######################################################################
1853# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1854#	const AES_KEY *key, unsigned int start_block_num,
1855#	unsigned char offset_i[16], const unsigned char L_[][16],
1856#	unsigned char checksum[16]);
1857#
1858{
1859# offsets within stack frame
1860my $checksum = 16*6;
1861my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1862
1863# reassigned registers
1864my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1865# $l_, $block, $inp, $key are permanently allocated in registers;
1866# the remaining non-volatile ones are offloaded to the stack, where
1867# they stay invariant once written.
1868
# aesni_ocb_encrypt(inp, out, blocks, key, start_block_num, offset_i, L_,
#		    checksum)
#
# Emits the OCB bulk-encryption routine.  For every 16-byte block the
# running offset is XOR-ed with an L_[] entry selected by bsf (index of
# the lowest set bit) of the block number, the plaintext block is XOR-ed
# into the checksum, and the block is XOR-ed with the offset before and
# after the cipher call.  Blocks are processed six at a time in
# "grandloop"; 1..5 leftover blocks are handled by the dedicated tails
# "one".."four" and the fall-through five-block path.  On exit the
# updated offset and checksum are stored back through the offset_i and
# checksum pointers, and the XMM registers plus the stack scratch area
# are zeroed.
#
# Register roles between stages:
#   $rndkey0	last offset_i (temporarily reused for round keys)
#   $rndkey1	running checksum (temporarily reused for round keys)
#   $out	out-inp, so output is addressed as ($out,$inp)
#   $len	"end of input - 16*6" while looping, bytes left in the tails
# $out and $len double as $i5 and $i3 and are therefore reloaded from the
# frame after every use as a bsf index.
&function_begin("aesni_ocb_encrypt");
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
	&mov	($block,&wparam(4));		# start_block_num
	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
	&mov	($l_,&wparam(6));		# L_

	# Build the frame; the caller's %esp travels in $rounds until it is
	# saved at $esp_off below.
	&mov	($rounds,"esp");
	&sub	("esp",$esp_off+4);		# alloca
	&and	("esp",-16);			# align stack

	# $out becomes out-inp; $len becomes inp+16*blocks-16*6.
	&sub	($out,$inp);
	&shl	($len,4);
	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
	&mov	(&DWP($out_off,"esp"),$out);
	&mov	(&DWP($end_off,"esp"),$len);
	&mov	(&DWP($esp_off,"esp"),$rounds);

	&mov	($rounds,&DWP(240,$key));

	# The multi-block paths below load L_[0] for their 1st, 3rd and 5th
	# block without a bsf, which is only right when $block is odd.  If
	# the start block number is even, process one block on its own
	# first so that $block is odd from label "odd" onwards.
	&test	($block,1);
	&jnz	(&label("odd"));

	&bsf		($i3,$block);
	&add		($block,1);
	&shl		($i3,4);
	&movdqu		($inout5,&QWP(0,$l_,$i3));
	&mov		($i3,$key);			# put aside key

	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&lea		($inp,&DWP(16,$inp));

	&pxor		($inout5,$rndkey0);		# ^ last offset_i
	&pxor		($rndkey1,$inout0);		# checksum
	&pxor		($inout0,$inout5);		# ^ offset_i

	# Checksum is parked in $inout4 and the offset in $inout5 across the
	# single-block cipher code.
	&movdqa		($inout4,$rndkey1);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&xorps		($inout0,$inout5);		# ^ offset_i
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&movdqa		($rndkey1,$inout4);		# pass the checksum

	&movups		(&QWP(-16,$out,$inp),$inout0);	# store output

	# $rounds/$key are re-derived from the copy in $i3; $len (== $i3 was
	# used as scratch) is reloaded from the frame.
	&mov		($rounds,&DWP(240,$i3));
	&mov		($key,$i3);			# restore key
	&mov		($len,&DWP($end_off,"esp"));

&set_label("odd");
	# Save the original key pointer, then switch to "twisted" addressing:
	# $key = key+32+16*rounds and the value 16-16*rounds is kept at
	# $rounds_off, so that -48/-32/-16($key,$rounds) resolve to key+0,
	# key+16 and key+32 in the multi-block paths.
	&shl		($rounds,4);
	&mov		($out,16);
	&sub		($out,$rounds);			# twisted rounds
	&mov		(&DWP($key_off,"esp"),$key);
	&lea		($key,&DWP(32,$key,$rounds));	# end of key schedule
	&mov		(&DWP($rounds_off,"esp"),$out);

	# Fewer than six blocks left?  Go straight to the tail dispatch.
	&cmp		($inp,$len);
	&ja		(&label("short"));
	&jmp		(&label("grandloop"));

&set_label("grandloop",32);
	# Six blocks per iteration.  $block is odd, so only block+1, block+3
	# and block+5 need a bsf; the other three use L_[0].
	&lea		($i1,&DWP(1,$block));
	&lea		($i3,&DWP(3,$block));
	&lea		($i5,&DWP(5,$block));
	&add		($block,6);
	&bsf		($i1,$i1);
	&bsf		($i3,$i3);
	&bsf		($i5,$i5);
	&shl		($i1,4);
	&shl		($i3,4);
	&shl		($i5,4);
	&movdqu		($inout0,&QWP(0,$l_));
	&movdqu		($inout1,&QWP(0,$l_,$i1));
	&mov		($rounds,&DWP($rounds_off,"esp"));
	&movdqa		($inout2,$inout0);
	&movdqu		($inout3,&QWP(0,$l_,$i3));
	&movdqa		($inout4,$inout0);
	&movdqu		($inout5,&QWP(0,$l_,$i5));

	# Chain the six offsets (each is the previous one XOR its L_ entry)
	# and park them in frame slots 0..5.
	&pxor		($inout0,$rndkey0);		# ^ last offset_i
	&pxor		($inout1,$inout0);
	&movdqa		(&QWP(16*0,"esp"),$inout0);
	&pxor		($inout2,$inout1);
	&movdqa		(&QWP(16*1,"esp"),$inout1);
	&pxor		($inout3,$inout2);
	&movdqa		(&QWP(16*2,"esp"),$inout2);
	&pxor		($inout4,$inout3);
	&movdqa		(&QWP(16*3,"esp"),$inout3);
	&pxor		($inout5,$inout4);
	&movdqa		(&QWP(16*4,"esp"),$inout4);
	&movdqa		(&QWP(16*5,"esp"),$inout5);

	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&movdqu		($inout1,&QWP(16*1,$inp));
	&movdqu		($inout2,&QWP(16*2,$inp));
	&movdqu		($inout3,&QWP(16*3,$inp));
	&movdqu		($inout4,&QWP(16*4,$inp));
	&movdqu		($inout5,&QWP(16*5,$inp));
	&lea		($inp,&DWP(16*6,$inp));

	# Fold the plaintext into the checksum while applying the first
	# round key; the checksum then moves to the frame because $rndkey1
	# is about to be reused for a round key.
	&pxor		($rndkey1,$inout0);		# checksum
	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
	&pxor		($rndkey1,$inout1);
	&pxor		($inout1,$rndkey0);
	&pxor		($rndkey1,$inout2);
	&pxor		($inout2,$rndkey0);
	&pxor		($rndkey1,$inout3);
	&pxor		($inout3,$rndkey0);
	&pxor		($rndkey1,$inout4);
	&pxor		($inout4,$rndkey0);
	&pxor		($rndkey1,$inout5);
	&pxor		($inout5,$rndkey0);
	&movdqa		(&QWP($checksum,"esp"),$rndkey1);

	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($inout2,&QWP(16*2,"esp"));
	&pxor		($inout3,&QWP(16*3,"esp"));
	&pxor		($inout4,&QWP(16*4,"esp"));
	&pxor		($inout5,&QWP(16*5,"esp"));

	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($inout1,$rndkey1);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout3,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&aesenc		($inout5,$rndkey1);

	# $out and $len were used as $i5/$i3 above; reload them.
	# NOTE(review): _aesni_encrypt6_enter is defined outside this
	# section; presumably it continues the rounds from the key already
	# loaded in $rndkey0 -- confirm against its definition.
	&mov		($out,&DWP($out_off,"esp"));
	&mov		($len,&DWP($end_off,"esp"));
	&call		("_aesni_encrypt6_enter");

	&movdqa		($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($inout2,&QWP(16*2,"esp"));
	&pxor		($inout3,&QWP(16*3,"esp"));
	&pxor		($inout4,&QWP(16*4,"esp"));
	&pxor		($inout5,$rndkey0);
	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum

	# $inp was already advanced by 16*6, hence the negative displacements.
	&movdqu		(&QWP(-16*6,$out,$inp),$inout0);# store output
	&movdqu		(&QWP(-16*5,$out,$inp),$inout1);
	&movdqu		(&QWP(-16*4,$out,$inp),$inout2);
	&movdqu		(&QWP(-16*3,$out,$inp),$inout3);
	&movdqu		(&QWP(-16*2,$out,$inp),$inout4);
	&movdqu		(&QWP(-16*1,$out,$inp),$inout5);
	&cmp		($inp,$len);			# done yet?
	&jbe		(&label("grandloop"));

&set_label("short");
	# $len := (end of input) - $inp, i.e. the number of bytes left.
	&add		($len,16*6);
	&sub		($len,$inp);
	&jz		(&label("done"));

	# Dispatch on 1, 2, 3 or 4 remaining blocks; anything else falls
	# through to the five-block path.
	&cmp		($len,16*2);
	&jb		(&label("one"));
	&je		(&label("two"));

	&cmp		($len,16*4);
	&jb		(&label("three"));
	&je		(&label("four"));

	# Five blocks: same scheme as grandloop with an all-zero sixth lane
	# whose result is never stored.
	&lea		($i1,&DWP(1,$block));
	&lea		($i3,&DWP(3,$block));
	&bsf		($i1,$i1);
	&bsf		($i3,$i3);
	&shl		($i1,4);
	&shl		($i3,4);
	&movdqu		($inout0,&QWP(0,$l_));
	&movdqu		($inout1,&QWP(0,$l_,$i1));
	&mov		($rounds,&DWP($rounds_off,"esp"));
	&movdqa		($inout2,$inout0);
	&movdqu		($inout3,&QWP(0,$l_,$i3));
	&movdqa		($inout4,$inout0);

	&pxor		($inout0,$rndkey0);		# ^ last offset_i
	&pxor		($inout1,$inout0);
	&movdqa		(&QWP(16*0,"esp"),$inout0);
	&pxor		($inout2,$inout1);
	&movdqa		(&QWP(16*1,"esp"),$inout1);
	&pxor		($inout3,$inout2);
	&movdqa		(&QWP(16*2,"esp"),$inout2);
	&pxor		($inout4,$inout3);
	&movdqa		(&QWP(16*3,"esp"),$inout3);
	# The next XOR has no lasting effect: $inout5 is zeroed below before
	# it is used.
	&pxor		($inout5,$inout4);
	&movdqa		(&QWP(16*4,"esp"),$inout4);

	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&movdqu		($inout1,&QWP(16*1,$inp));
	&movdqu		($inout2,&QWP(16*2,$inp));
	&movdqu		($inout3,&QWP(16*3,$inp));
	&movdqu		($inout4,&QWP(16*4,$inp));
	&pxor		($inout5,$inout5);

	&pxor		($rndkey1,$inout0);		# checksum
	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
	&pxor		($rndkey1,$inout1);
	&pxor		($inout1,$rndkey0);
	&pxor		($rndkey1,$inout2);
	&pxor		($inout2,$rndkey0);
	&pxor		($rndkey1,$inout3);
	&pxor		($inout3,$rndkey0);
	&pxor		($rndkey1,$inout4);
	&pxor		($inout4,$rndkey0);
	&movdqa		(&QWP($checksum,"esp"),$rndkey1);

	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($inout2,&QWP(16*2,"esp"));
	&pxor		($inout3,&QWP(16*3,"esp"));
	&pxor		($inout4,&QWP(16*4,"esp"));

	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($inout1,$rndkey1);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout3,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&aesenc		($inout5,$rndkey1);

	&mov		($out,&DWP($out_off,"esp"));
	&call		("_aesni_encrypt6_enter");

	&movdqa		($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($inout2,&QWP(16*2,"esp"));
	&pxor		($inout3,&QWP(16*3,"esp"));
	&pxor		($inout4,$rndkey0);
	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum

	# $inp is not advanced in the tails, hence non-negative displacements.
	&movdqu		(&QWP(16*0,$out,$inp),$inout0);	# store output
	&movdqu		(&QWP(16*1,$out,$inp),$inout1);
	&movdqu		(&QWP(16*2,$out,$inp),$inout2);
	&movdqu		(&QWP(16*3,$out,$inp),$inout3);
	&movdqu		(&QWP(16*4,$out,$inp),$inout4);

	&jmp		(&label("done"));

&set_label("one",16);
	# One block: offset uses L_[0]; checksum rides in $inout4 and the
	# offset in $inout5 across the cipher code.
	&movdqu		($inout5,&QWP(0,$l_));
	&mov		($key,&DWP($key_off,"esp"));	# restore key

	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&mov		($rounds,&DWP(240,$key));

	&pxor		($inout5,$rndkey0);		# ^ last offset_i
	&pxor		($rndkey1,$inout0);		# checksum
	&pxor		($inout0,$inout5);		# ^ offset_i

	&movdqa		($inout4,$rndkey1);
	&mov		($out,&DWP($out_off,"esp"));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&xorps		($inout0,$inout5);		# ^ offset_i
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&movdqa		($rndkey1,$inout4);		# pass the checksum
	&movups		(&QWP(0,$out,$inp),$inout0);

	&jmp		(&label("done"));

&set_label("two",16);
	# Two blocks: offsets in $inout4/$inout5, checksum parked in $inout3.
	&lea		($i1,&DWP(1,$block));
	&mov		($key,&DWP($key_off,"esp"));	# restore key
	&bsf		($i1,$i1);
	&shl		($i1,4);
	&movdqu		($inout4,&QWP(0,$l_));
	&movdqu		($inout5,&QWP(0,$l_,$i1));

	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&movdqu		($inout1,&QWP(16*1,$inp));
	&mov		($rounds,&DWP(240,$key));

	&pxor		($inout4,$rndkey0);		# ^ last offset_i
	&pxor		($inout5,$inout4);

	&pxor		($rndkey1,$inout0);		# checksum
	&pxor		($inout0,$inout4);		# ^ offset_i
	&pxor		($rndkey1,$inout1);
	&pxor		($inout1,$inout5);

	# Explicit ';' here: without it this statement and the next one are
	# parsed as a single bitwise-AND expression.
	&movdqa		($inout3,$rndkey1);
	&mov		($out,&DWP($out_off,"esp"));
	&call		("_aesni_encrypt2");

	&xorps		($inout0,$inout4);		# ^ offset_i
	&xorps		($inout1,$inout5);
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&movdqa		($rndkey1,$inout3);		# pass the checksum
	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
	&movups		(&QWP(16*1,$out,$inp),$inout1);

	&jmp		(&label("done"));

&set_label("three",16);
	# Three blocks: offsets in $inout3..$inout5, checksum parked in the
	# frame.
	&lea		($i1,&DWP(1,$block));
	&mov		($key,&DWP($key_off,"esp"));	# restore key
	&bsf		($i1,$i1);
	&shl		($i1,4);
	&movdqu		($inout3,&QWP(0,$l_));
	&movdqu		($inout4,&QWP(0,$l_,$i1));
	&movdqa		($inout5,$inout3);

	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&movdqu		($inout1,&QWP(16*1,$inp));
	&movdqu		($inout2,&QWP(16*2,$inp));
	&mov		($rounds,&DWP(240,$key));

	&pxor		($inout3,$rndkey0);		# ^ last offset_i
	&pxor		($inout4,$inout3);
	&pxor		($inout5,$inout4);

	&pxor		($rndkey1,$inout0);		# checksum
	&pxor		($inout0,$inout3);		# ^ offset_i
	&pxor		($rndkey1,$inout1);
	&pxor		($inout1,$inout4);
	&pxor		($rndkey1,$inout2);
	&pxor		($inout2,$inout5);

	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
	&mov		($out,&DWP($out_off,"esp"));
	&call		("_aesni_encrypt3");

	&xorps		($inout0,$inout3);		# ^ offset_i
	&xorps		($inout1,$inout4);
	&xorps		($inout2,$inout5);
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
	&movups		(&QWP(16*1,$out,$inp),$inout1);
	&movups		(&QWP(16*2,$out,$inp),$inout2);

	&jmp		(&label("done"));

&set_label("four",16);
	# Four blocks: the first two offsets live in frame slots 0 and 1, the
	# last two in $inout4/$inout5; checksum parked in the frame.
	&lea		($i1,&DWP(1,$block));
	&lea		($i3,&DWP(3,$block));
	&bsf		($i1,$i1);
	&bsf		($i3,$i3);
	&mov		($key,&DWP($key_off,"esp"));	# restore key
	&shl		($i1,4);
	&shl		($i3,4);
	&movdqu		($inout2,&QWP(0,$l_));
	&movdqu		($inout3,&QWP(0,$l_,$i1));
	&movdqa		($inout4,$inout2);
	&movdqu		($inout5,&QWP(0,$l_,$i3));

	&pxor		($inout2,$rndkey0);		# ^ last offset_i
	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&pxor		($inout3,$inout2);
	&movdqu		($inout1,&QWP(16*1,$inp));
	&pxor		($inout4,$inout3);
	&movdqa		(&QWP(16*0,"esp"),$inout2);
	&pxor		($inout5,$inout4);
	&movdqa		(&QWP(16*1,"esp"),$inout3);
	&movdqu		($inout2,&QWP(16*2,$inp));
	&movdqu		($inout3,&QWP(16*3,$inp));
	&mov		($rounds,&DWP(240,$key));

	&pxor		($rndkey1,$inout0);		# checksum
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor		($rndkey1,$inout1);
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($rndkey1,$inout2);
	&pxor		($inout2,$inout4);
	&pxor		($rndkey1,$inout3);
	&pxor		($inout3,$inout5);

	# Explicit ';' here: without it this statement and the next one are
	# parsed as a single bitwise-AND expression.
	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
	&mov		($out,&DWP($out_off,"esp"));
	&call		("_aesni_encrypt4");

	&xorps		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&xorps		($inout1,&QWP(16*1,"esp"));
	&xorps		($inout2,$inout4);
	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
	&xorps		($inout3,$inout5);
	&movups		(&QWP(16*1,$out,$inp),$inout1);
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&movups		(&QWP(16*2,$out,$inp),$inout2);
	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
	&movups		(&QWP(16*3,$out,$inp),$inout3);

&set_label("done");
	# Fetch the caller's %esp, wipe $inout0..$inout5 and frame slots
	# 0..6 (offsets and checksum), then switch back to the caller's
	# stack before touching the arguments again.
	&mov	($key,&DWP($esp_off,"esp"));
	&pxor	($inout0,$inout0);		# clear register bank
	&pxor	($inout1,$inout1);
	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
	&pxor	($inout2,$inout2);
	&movdqa	(&QWP(16*1,"esp"),$inout0);
	&pxor	($inout3,$inout3);
	&movdqa	(&QWP(16*2,"esp"),$inout0);
	&pxor	($inout4,$inout4);
	&movdqa	(&QWP(16*3,"esp"),$inout0);
	&pxor	($inout5,$inout5);
	&movdqa	(&QWP(16*4,"esp"),$inout0);
	&movdqa	(&QWP(16*5,"esp"),$inout0);
	&movdqa	(&QWP(16*6,"esp"),$inout0);

	# Write the final offset and checksum back to the caller's buffers
	# and clear the registers that held them.
	&lea	("esp",&DWP(0,$key));
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum
	&movdqu	(&QWP(0,$rounds),$rndkey0);
	&pxor	($rndkey0,$rndkey0);
	&movdqu	(&QWP(0,$rounds_),$rndkey1);
	&pxor	($rndkey1,$rndkey1);
&function_end("aesni_ocb_encrypt");
2295
# aesni_ocb_decrypt(inp, out, blocks, key, start_block_num, offset_i, L_,
#		    checksum)
#
# Emits the OCB bulk-decryption routine.  Argument handling, frame
# layout, offset derivation (bsf of the block number indexing L_[]) and
# the six-blocks-per-iteration structure mirror aesni_ocb_encrypt in
# this file.  The differences visible below are that the aesdec /
# _aesni_decrypt* code paths are used, and that the checksum is updated
# from the cipher *output* (after the trailing offset XOR) rather than
# from the input.  On exit the updated offset and checksum are stored
# back through the offset_i and checksum pointers, and the XMM registers
# plus the stack scratch area are zeroed.
#
# Register roles between stages:
#   $rndkey0	last offset_i (temporarily reused for round keys)
#   $rndkey1	running checksum (temporarily reused for round keys)
#   $out	out-inp, so output is addressed as ($out,$inp)
#   $len	"end of input - 16*6" while looping, bytes left in the tails
&function_begin("aesni_ocb_decrypt");
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
	&mov	($block,&wparam(4));		# start_block_num
	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
	&mov	($l_,&wparam(6));		# L_

	# Build the frame; the caller's %esp travels in $rounds until it is
	# saved at $esp_off below.
	&mov	($rounds,"esp");
	&sub	("esp",$esp_off+4);		# alloca
	&and	("esp",-16);			# align stack

	# $out becomes out-inp; $len becomes inp+16*blocks-16*6.
	&sub	($out,$inp);
	&shl	($len,4);
	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
	&mov	(&DWP($out_off,"esp"),$out);
	&mov	(&DWP($end_off,"esp"),$len);
	&mov	(&DWP($esp_off,"esp"),$rounds);

	&mov	($rounds,&DWP(240,$key));

	# If the start block number is even, process one block on its own
	# so that $block is odd from label "odd" onwards (the multi-block
	# paths load L_[0] for every other block without a bsf).
	&test	($block,1);
	&jnz	(&label("odd"));

	&bsf		($i3,$block);
	&add		($block,1);
	&shl		($i3,4);
	&movdqu		($inout5,&QWP(0,$l_,$i3));
	&mov		($i3,$key);			# put aside key

	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&lea		($inp,&DWP(16,$inp));

	&pxor		($inout5,$rndkey0);		# ^ last offset_i
	&pxor		($inout0,$inout5);		# ^ offset_i

	# Checksum is parked in $inout4 and the offset in $inout5 across the
	# single-block cipher code.
	&movdqa		($inout4,$rndkey1);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}

	&xorps		($inout0,$inout5);		# ^ offset_i
	&movaps		($rndkey1,$inout4);		# pass the checksum
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&xorps		($rndkey1,$inout0);		# checksum
	&movups		(&QWP(-16,$out,$inp),$inout0);	# store output

	# $rounds/$key are re-derived from the copy in $i3; $len (== $i3 was
	# used as scratch) is reloaded from the frame.
	&mov		($rounds,&DWP(240,$i3));
	&mov		($key,$i3);			# restore key
	&mov		($len,&DWP($end_off,"esp"));

&set_label("odd");
	# Save the original key pointer, then switch to "twisted" addressing:
	# $key = key+32+16*rounds and the value 16-16*rounds is kept at
	# $rounds_off, so that -48/-32/-16($key,$rounds) resolve to key+0,
	# key+16 and key+32 in the multi-block paths.
	&shl		($rounds,4);
	&mov		($out,16);
	&sub		($out,$rounds);			# twisted rounds
	&mov		(&DWP($key_off,"esp"),$key);
	&lea		($key,&DWP(32,$key,$rounds));	# end of key schedule
	&mov		(&DWP($rounds_off,"esp"),$out);

	# Fewer than six blocks left?  Go straight to the tail dispatch.
	&cmp		($inp,$len);
	&ja		(&label("short"));
	&jmp		(&label("grandloop"));

&set_label("grandloop",32);
	# Six blocks per iteration.  $block is odd, so only block+1, block+3
	# and block+5 need a bsf; the other three use L_[0].
	&lea		($i1,&DWP(1,$block));
	&lea		($i3,&DWP(3,$block));
	&lea		($i5,&DWP(5,$block));
	&add		($block,6);
	&bsf		($i1,$i1);
	&bsf		($i3,$i3);
	&bsf		($i5,$i5);
	&shl		($i1,4);
	&shl		($i3,4);
	&shl		($i5,4);
	&movdqu		($inout0,&QWP(0,$l_));
	&movdqu		($inout1,&QWP(0,$l_,$i1));
	&mov		($rounds,&DWP($rounds_off,"esp"));
	&movdqa		($inout2,$inout0);
	&movdqu		($inout3,&QWP(0,$l_,$i3));
	&movdqa		($inout4,$inout0);
	&movdqu		($inout5,&QWP(0,$l_,$i5));

	# Chain the six offsets (each is the previous one XOR its L_ entry)
	# and park them in frame slots 0..5.
	&pxor		($inout0,$rndkey0);		# ^ last offset_i
	&pxor		($inout1,$inout0);
	&movdqa		(&QWP(16*0,"esp"),$inout0);
	&pxor		($inout2,$inout1);
	&movdqa		(&QWP(16*1,"esp"),$inout1);
	&pxor		($inout3,$inout2);
	&movdqa		(&QWP(16*2,"esp"),$inout2);
	&pxor		($inout4,$inout3);
	&movdqa		(&QWP(16*3,"esp"),$inout3);
	&pxor		($inout5,$inout4);
	&movdqa		(&QWP(16*4,"esp"),$inout4);
	&movdqa		(&QWP(16*5,"esp"),$inout5);

	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&movdqu		($inout1,&QWP(16*1,$inp));
	&movdqu		($inout2,&QWP(16*2,$inp));
	&movdqu		($inout3,&QWP(16*3,$inp));
	&movdqu		($inout4,&QWP(16*4,$inp));
	&movdqu		($inout5,&QWP(16*5,$inp));
	&lea		($inp,&DWP(16*6,$inp));

	# Park the checksum in the frame: $rndkey1 is about to be reused for
	# a round key, and the checksum is only updated after decryption.
	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&pxor		($inout5,$rndkey0);

	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($inout2,&QWP(16*2,"esp"));
	&pxor		($inout3,&QWP(16*3,"esp"));
	&pxor		($inout4,&QWP(16*4,"esp"));
	&pxor		($inout5,&QWP(16*5,"esp"));

	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&aesdec		($inout0,$rndkey1);
	&aesdec		($inout1,$rndkey1);
	&aesdec		($inout2,$rndkey1);
	&aesdec		($inout3,$rndkey1);
	&aesdec		($inout4,$rndkey1);
	&aesdec		($inout5,$rndkey1);

	# $out and $len were used as $i5/$i3 above; reload them.
	# NOTE(review): _aesni_decrypt6_enter is defined outside this
	# section; presumably it continues the rounds from the key already
	# loaded in $rndkey0 -- confirm against its definition.
	&mov		($out,&DWP($out_off,"esp"));
	&mov		($len,&DWP($end_off,"esp"));
	&call		("_aesni_decrypt6_enter");

	&movdqa		($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&movdqa		($rndkey1,&QWP($checksum,"esp"));
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($inout2,&QWP(16*2,"esp"));
	&pxor		($inout3,&QWP(16*3,"esp"));
	&pxor		($inout4,&QWP(16*4,"esp"));
	&pxor		($inout5,$rndkey0);

	# Fold each output block into the checksum while storing it; $inp
	# was already advanced by 16*6, hence the negative displacements.
	&pxor		($rndkey1,$inout0);		# checksum
	&movdqu		(&QWP(-16*6,$out,$inp),$inout0);# store output
	&pxor		($rndkey1,$inout1);
	&movdqu		(&QWP(-16*5,$out,$inp),$inout1);
	&pxor		($rndkey1,$inout2);
	&movdqu		(&QWP(-16*4,$out,$inp),$inout2);
	&pxor		($rndkey1,$inout3);
	&movdqu		(&QWP(-16*3,$out,$inp),$inout3);
	&pxor		($rndkey1,$inout4);
	&movdqu		(&QWP(-16*2,$out,$inp),$inout4);
	&pxor		($rndkey1,$inout5);
	&movdqu		(&QWP(-16*1,$out,$inp),$inout5);
	&cmp		($inp,$len);			# done yet?
	&jbe		(&label("grandloop"));

&set_label("short");
	# $len := (end of input) - $inp, i.e. the number of bytes left.
	&add		($len,16*6);
	&sub		($len,$inp);
	&jz		(&label("done"));

	# Dispatch on 1, 2, 3 or 4 remaining blocks; anything else falls
	# through to the five-block path.
	&cmp		($len,16*2);
	&jb		(&label("one"));
	&je		(&label("two"));

	&cmp		($len,16*4);
	&jb		(&label("three"));
	&je		(&label("four"));

	# Five blocks: same scheme as grandloop with an all-zero sixth lane
	# whose result is never stored or added to the checksum.
	&lea		($i1,&DWP(1,$block));
	&lea		($i3,&DWP(3,$block));
	&bsf		($i1,$i1);
	&bsf		($i3,$i3);
	&shl		($i1,4);
	&shl		($i3,4);
	&movdqu		($inout0,&QWP(0,$l_));
	&movdqu		($inout1,&QWP(0,$l_,$i1));
	&mov		($rounds,&DWP($rounds_off,"esp"));
	&movdqa		($inout2,$inout0);
	&movdqu		($inout3,&QWP(0,$l_,$i3));
	&movdqa		($inout4,$inout0);

	&pxor		($inout0,$rndkey0);		# ^ last offset_i
	&pxor		($inout1,$inout0);
	&movdqa		(&QWP(16*0,"esp"),$inout0);
	&pxor		($inout2,$inout1);
	&movdqa		(&QWP(16*1,"esp"),$inout1);
	&pxor		($inout3,$inout2);
	&movdqa		(&QWP(16*2,"esp"),$inout2);
	&pxor		($inout4,$inout3);
	&movdqa		(&QWP(16*3,"esp"),$inout3);
	# The next XOR has no lasting effect: $inout5 is zeroed below before
	# it is used.
	&pxor		($inout5,$inout4);
	&movdqa		(&QWP(16*4,"esp"),$inout4);

	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&movdqu		($inout1,&QWP(16*1,$inp));
	&movdqu		($inout2,&QWP(16*2,$inp));
	&movdqu		($inout3,&QWP(16*3,$inp));
	&movdqu		($inout4,&QWP(16*4,$inp));
	&pxor		($inout5,$inout5);

	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);

	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($inout2,&QWP(16*2,"esp"));
	&pxor		($inout3,&QWP(16*3,"esp"));
	&pxor		($inout4,&QWP(16*4,"esp"));

	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&aesdec		($inout0,$rndkey1);
	&aesdec		($inout1,$rndkey1);
	&aesdec		($inout2,$rndkey1);
	&aesdec		($inout3,$rndkey1);
	&aesdec		($inout4,$rndkey1);
	&aesdec		($inout5,$rndkey1);

	&mov		($out,&DWP($out_off,"esp"));
	&call		("_aesni_decrypt6_enter");

	&movdqa		($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&movdqa		($rndkey1,&QWP($checksum,"esp"));
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($inout2,&QWP(16*2,"esp"));
	&pxor		($inout3,&QWP(16*3,"esp"));
	&pxor		($inout4,$rndkey0);

	# $inp is not advanced in the tails, hence non-negative displacements.
	&pxor		($rndkey1,$inout0);		# checksum
	&movdqu		(&QWP(16*0,$out,$inp),$inout0);	# store output
	&pxor		($rndkey1,$inout1);
	&movdqu		(&QWP(16*1,$out,$inp),$inout1);
	&pxor		($rndkey1,$inout2);
	&movdqu		(&QWP(16*2,$out,$inp),$inout2);
	&pxor		($rndkey1,$inout3);
	&movdqu		(&QWP(16*3,$out,$inp),$inout3);
	&pxor		($rndkey1,$inout4);
	&movdqu		(&QWP(16*4,$out,$inp),$inout4);

	&jmp		(&label("done"));

&set_label("one",16);
	# One block: offset uses L_[0]; checksum rides in $inout4 and the
	# offset in $inout5 across the cipher code.
	&movdqu		($inout5,&QWP(0,$l_));
	&mov		($key,&DWP($key_off,"esp"));	# restore key

	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&mov		($rounds,&DWP(240,$key));

	&pxor		($inout5,$rndkey0);		# ^ last offset_i
	&pxor		($inout0,$inout5);		# ^ offset_i

	&movdqa		($inout4,$rndkey1);
	&mov		($out,&DWP($out_off,"esp"));
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}

	&xorps		($inout0,$inout5);		# ^ offset_i
	&movaps		($rndkey1,$inout4);		# pass the checksum
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&xorps		($rndkey1,$inout0);		# checksum
	&movups		(&QWP(0,$out,$inp),$inout0);

	&jmp		(&label("done"));

&set_label("two",16);
	# Two blocks: offsets in $inout4/$inout5, checksum parked in $inout3
	# and updated there after the call.
	&lea		($i1,&DWP(1,$block));
	&mov		($key,&DWP($key_off,"esp"));	# restore key
	&bsf		($i1,$i1);
	&shl		($i1,4);
	&movdqu		($inout4,&QWP(0,$l_));
	&movdqu		($inout5,&QWP(0,$l_,$i1));

	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&movdqu		($inout1,&QWP(16*1,$inp));
	&mov		($rounds,&DWP(240,$key));

	&movdqa		($inout3,$rndkey1);
	&pxor		($inout4,$rndkey0);		# ^ last offset_i
	&pxor		($inout5,$inout4);

	&pxor		($inout0,$inout4);		# ^ offset_i
	&pxor		($inout1,$inout5);

	&mov		($out,&DWP($out_off,"esp"));
	&call		("_aesni_decrypt2");

	&xorps		($inout0,$inout4);		# ^ offset_i
	&xorps		($inout1,$inout5);
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&xorps		($inout3,$inout0);		# checksum
	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
	&xorps		($inout3,$inout1);
	&movups		(&QWP(16*1,$out,$inp),$inout1);
	&movaps		($rndkey1,$inout3);		# pass the checksum

	&jmp		(&label("done"));

&set_label("three",16);
	# Three blocks: offsets in $inout3..$inout5, checksum parked in the
	# frame across the call.
	&lea		($i1,&DWP(1,$block));
	&mov		($key,&DWP($key_off,"esp"));	# restore key
	&bsf		($i1,$i1);
	&shl		($i1,4);
	&movdqu		($inout3,&QWP(0,$l_));
	&movdqu		($inout4,&QWP(0,$l_,$i1));
	&movdqa		($inout5,$inout3);

	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&movdqu		($inout1,&QWP(16*1,$inp));
	&movdqu		($inout2,&QWP(16*2,$inp));
	&mov		($rounds,&DWP(240,$key));

	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
	&pxor		($inout3,$rndkey0);		# ^ last offset_i
	&pxor		($inout4,$inout3);
	&pxor		($inout5,$inout4);

	&pxor		($inout0,$inout3);		# ^ offset_i
	&pxor		($inout1,$inout4);
	&pxor		($inout2,$inout5);

	&mov		($out,&DWP($out_off,"esp"));
	&call		("_aesni_decrypt3");

	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
	&xorps		($inout0,$inout3);		# ^ offset_i
	&xorps		($inout1,$inout4);
	&xorps		($inout2,$inout5);
	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
	&pxor		($rndkey1,$inout0);		# checksum
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&movups		(&QWP(16*1,$out,$inp),$inout1);
	&pxor		($rndkey1,$inout1);
	&movups		(&QWP(16*2,$out,$inp),$inout2);
	&pxor		($rndkey1,$inout2);

	&jmp		(&label("done"));

&set_label("four",16);
	# Four blocks: the first two offsets live in frame slots 0 and 1, the
	# last two in $inout4/$inout5; checksum parked in the frame.
	&lea		($i1,&DWP(1,$block));
	&lea		($i3,&DWP(3,$block));
	&bsf		($i1,$i1);
	&bsf		($i3,$i3);
	&mov		($key,&DWP($key_off,"esp"));	# restore key
	&shl		($i1,4);
	&shl		($i3,4);
	&movdqu		($inout2,&QWP(0,$l_));
	&movdqu		($inout3,&QWP(0,$l_,$i1));
	&movdqa		($inout4,$inout2);
	&movdqu		($inout5,&QWP(0,$l_,$i3));

	&pxor		($inout2,$rndkey0);		# ^ last offset_i
	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
	&pxor		($inout3,$inout2);
	&movdqu		($inout1,&QWP(16*1,$inp));
	&pxor		($inout4,$inout3);
	&movdqa		(&QWP(16*0,"esp"),$inout2);
	&pxor		($inout5,$inout4);
	&movdqa		(&QWP(16*1,"esp"),$inout3);
	&movdqu		($inout2,&QWP(16*2,$inp));
	&movdqu		($inout3,&QWP(16*3,$inp));
	&mov		($rounds,&DWP(240,$key));

	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&pxor		($inout1,&QWP(16*1,"esp"));
	&pxor		($inout2,$inout4);
	&pxor		($inout3,$inout5);

	&mov		($out,&DWP($out_off,"esp"));
	&call		("_aesni_decrypt4");

	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
	&xorps		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
	&xorps		($inout1,&QWP(16*1,"esp"));
	&xorps		($inout2,$inout4);
	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
	&pxor		($rndkey1,$inout0);		# checksum
	&xorps		($inout3,$inout5);
	&movups		(&QWP(16*1,$out,$inp),$inout1);
	&pxor		($rndkey1,$inout1);
	&movdqa		($rndkey0,$inout5);		# pass last offset_i
	&movups		(&QWP(16*2,$out,$inp),$inout2);
	&pxor		($rndkey1,$inout2);
	&movups		(&QWP(16*3,$out,$inp),$inout3);
	&pxor		($rndkey1,$inout3);

&set_label("done");
	# Fetch the caller's %esp, wipe $inout0..$inout5 and frame slots
	# 0..6 (offsets and checksum), then switch back to the caller's
	# stack before touching the arguments again.
	&mov	($key,&DWP($esp_off,"esp"));
	&pxor	($inout0,$inout0);		# clear register bank
	&pxor	($inout1,$inout1);
	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
	&pxor	($inout2,$inout2);
	&movdqa	(&QWP(16*1,"esp"),$inout0);
	&pxor	($inout3,$inout3);
	&movdqa	(&QWP(16*2,"esp"),$inout0);
	&pxor	($inout4,$inout4);
	&movdqa	(&QWP(16*3,"esp"),$inout0);
	&pxor	($inout5,$inout5);
	&movdqa	(&QWP(16*4,"esp"),$inout0);
	&movdqa	(&QWP(16*5,"esp"),$inout0);
	&movdqa	(&QWP(16*6,"esp"),$inout0);

	# Write the final offset and checksum back to the caller's buffers
	# and clear the registers that held them.
	&lea	("esp",&DWP(0,$key));
	&mov	($rounds,&wparam(5));		# &offset_i
	&mov	($rounds_,&wparam(7));		# &checksum
	&movdqu	(&QWP(0,$rounds),$rndkey0);
	&pxor	($rndkey0,$rndkey0);
	&movdqu	(&QWP(0,$rounds_),$rndkey1);
	&pxor	($rndkey1,$rndkey1);
&function_end("aesni_ocb_decrypt");
2721}
2722}
2723
2724######################################################################
2725# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2726#                           size_t length, const AES_KEY *key,
2727#                           unsigned char *ivp,const int enc);
2728&function_begin("${PREFIX}_cbc_encrypt");
2729	&mov	($inp,&wparam(0));
2730	&mov	($rounds_,"esp");
2731	&mov	($out,&wparam(1));
2732	&sub	($rounds_,24);
2733	&mov	($len,&wparam(2));
2734	&and	($rounds_,-16);
2735	&mov	($key,&wparam(3));
2736	&mov	($key_,&wparam(4));
2737	&test	($len,$len);
2738	&jz	(&label("cbc_abort"));
2739
2740	&cmp	(&wparam(5),0);
2741	&xchg	($rounds_,"esp");		# alloca
2742	&movups	($ivec,&QWP(0,$key_));		# load IV
2743	&mov	($rounds,&DWP(240,$key));
2744	&mov	($key_,$key);			# backup $key
2745	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
2746	&mov	($rounds_,$rounds);		# backup $rounds
2747	&je	(&label("cbc_decrypt"));
2748
2749	&movaps	($inout0,$ivec);
2750	&cmp	($len,16);
2751	&jb	(&label("cbc_enc_tail"));
2752	&sub	($len,16);
2753	&jmp	(&label("cbc_enc_loop"));
2754
2755&set_label("cbc_enc_loop",16);
2756	&movups	($ivec,&QWP(0,$inp));		# input actually
2757	&lea	($inp,&DWP(16,$inp));
2758	if ($inline)
2759	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
2760	else
2761	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
2762	&mov	($rounds,$rounds_);	# restore $rounds
2763	&mov	($key,$key_);		# restore $key
2764	&movups	(&QWP(0,$out),$inout0);	# store output
2765	&lea	($out,&DWP(16,$out));
2766	&sub	($len,16);
2767	&jnc	(&label("cbc_enc_loop"));
2768	&add	($len,16);
2769	&jnz	(&label("cbc_enc_tail"));
2770	&movaps	($ivec,$inout0);
2771	&pxor	($inout0,$inout0);
2772	&jmp	(&label("cbc_ret"));
2773
2774&set_label("cbc_enc_tail");
2775	&mov	("ecx",$len);		# zaps $rounds
2776	&data_word(0xA4F3F689);		# rep movsb
2777	&mov	("ecx",16);		# zero tail
2778	&sub	("ecx",$len);
2779	&xor	("eax","eax");		# zaps $len
2780	&data_word(0xAAF3F689);		# rep stosb
2781	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
2782	&mov	($rounds,$rounds_);	# restore $rounds
2783	&mov	($inp,$out);		# $inp and $out are the same
2784	&mov	($key,$key_);		# restore $key
2785	&jmp	(&label("cbc_enc_loop"));
2786######################################################################
2787&set_label("cbc_decrypt",16);
2788	&cmp	($len,0x50);
2789	&jbe	(&label("cbc_dec_tail"));
2790	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
2791	&sub	($len,0x50);
2792	&jmp	(&label("cbc_dec_loop6_enter"));
2793
2794&set_label("cbc_dec_loop6",16);
2795	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
2796	&movups	(&QWP(0,$out),$inout5);
2797	&lea	($out,&DWP(0x10,$out));
2798&set_label("cbc_dec_loop6_enter");
2799	&movdqu	($inout0,&QWP(0,$inp));
2800	&movdqu	($inout1,&QWP(0x10,$inp));
2801	&movdqu	($inout2,&QWP(0x20,$inp));
2802	&movdqu	($inout3,&QWP(0x30,$inp));
2803	&movdqu	($inout4,&QWP(0x40,$inp));
2804	&movdqu	($inout5,&QWP(0x50,$inp));
2805
2806	&call	("_aesni_decrypt6");
2807
2808	&movups	($rndkey1,&QWP(0,$inp));
2809	&movups	($rndkey0,&QWP(0x10,$inp));
2810	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
2811	&xorps	($inout1,$rndkey1);
2812	&movups	($rndkey1,&QWP(0x20,$inp));
2813	&xorps	($inout2,$rndkey0);
2814	&movups	($rndkey0,&QWP(0x30,$inp));
2815	&xorps	($inout3,$rndkey1);
2816	&movups	($rndkey1,&QWP(0x40,$inp));
2817	&xorps	($inout4,$rndkey0);
2818	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
2819	&xorps	($inout5,$rndkey1);
2820	&movups	(&QWP(0,$out),$inout0);
2821	&movups	(&QWP(0x10,$out),$inout1);
2822	&lea	($inp,&DWP(0x60,$inp));
2823	&movups	(&QWP(0x20,$out),$inout2);
2824	&mov	($rounds,$rounds_);		# restore $rounds
2825	&movups	(&QWP(0x30,$out),$inout3);
2826	&mov	($key,$key_);			# restore $key
2827	&movups	(&QWP(0x40,$out),$inout4);
2828	&lea	($out,&DWP(0x50,$out));
2829	&sub	($len,0x60);
2830	&ja	(&label("cbc_dec_loop6"));
2831
2832	&movaps	($inout0,$inout5);
2833	&movaps	($ivec,$rndkey0);
2834	&add	($len,0x50);
2835	&jle	(&label("cbc_dec_clear_tail_collected"));
2836	&movups	(&QWP(0,$out),$inout0);
2837	&lea	($out,&DWP(0x10,$out));
2838&set_label("cbc_dec_tail");
2839	&movups	($inout0,&QWP(0,$inp));
2840	&movaps	($in0,$inout0);
2841	&cmp	($len,0x10);
2842	&jbe	(&label("cbc_dec_one"));
2843
2844	&movups	($inout1,&QWP(0x10,$inp));
2845	&movaps	($in1,$inout1);
2846	&cmp	($len,0x20);
2847	&jbe	(&label("cbc_dec_two"));
2848
2849	&movups	($inout2,&QWP(0x20,$inp));
2850	&cmp	($len,0x30);
2851	&jbe	(&label("cbc_dec_three"));
2852
2853	&movups	($inout3,&QWP(0x30,$inp));
2854	&cmp	($len,0x40);
2855	&jbe	(&label("cbc_dec_four"));
2856
2857	&movups	($inout4,&QWP(0x40,$inp));
2858	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
2859	&movups	($inout0,&QWP(0,$inp));
2860	&xorps	($inout5,$inout5);
2861	&call	("_aesni_decrypt6");
2862	&movups	($rndkey1,&QWP(0,$inp));
2863	&movups	($rndkey0,&QWP(0x10,$inp));
2864	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
2865	&xorps	($inout1,$rndkey1);
2866	&movups	($rndkey1,&QWP(0x20,$inp));
2867	&xorps	($inout2,$rndkey0);
2868	&movups	($rndkey0,&QWP(0x30,$inp));
2869	&xorps	($inout3,$rndkey1);
2870	&movups	($ivec,&QWP(0x40,$inp));	# IV
2871	&xorps	($inout4,$rndkey0);
2872	&movups	(&QWP(0,$out),$inout0);
2873	&movups	(&QWP(0x10,$out),$inout1);
2874	&pxor	($inout1,$inout1);
2875	&movups	(&QWP(0x20,$out),$inout2);
2876	&pxor	($inout2,$inout2);
2877	&movups	(&QWP(0x30,$out),$inout3);
2878	&pxor	($inout3,$inout3);
2879	&lea	($out,&DWP(0x40,$out));
2880	&movaps	($inout0,$inout4);
2881	&pxor	($inout4,$inout4);
2882	&sub	($len,0x50);
2883	&jmp	(&label("cbc_dec_tail_collected"));
2884
2885&set_label("cbc_dec_one",16);
2886	if ($inline)
2887	{   &aesni_inline_generate1("dec");	}
2888	else
2889	{   &call	("_aesni_decrypt1");	}
2890	&xorps	($inout0,$ivec);
2891	&movaps	($ivec,$in0);
2892	&sub	($len,0x10);
2893	&jmp	(&label("cbc_dec_tail_collected"));
2894
2895&set_label("cbc_dec_two",16);
2896	&call	("_aesni_decrypt2");
2897	&xorps	($inout0,$ivec);
2898	&xorps	($inout1,$in0);
2899	&movups	(&QWP(0,$out),$inout0);
2900	&movaps	($inout0,$inout1);
2901	&pxor	($inout1,$inout1);
2902	&lea	($out,&DWP(0x10,$out));
2903	&movaps	($ivec,$in1);
2904	&sub	($len,0x20);
2905	&jmp	(&label("cbc_dec_tail_collected"));
2906
2907&set_label("cbc_dec_three",16);
2908	&call	("_aesni_decrypt3");
2909	&xorps	($inout0,$ivec);
2910	&xorps	($inout1,$in0);
2911	&xorps	($inout2,$in1);
2912	&movups	(&QWP(0,$out),$inout0);
2913	&movaps	($inout0,$inout2);
2914	&pxor	($inout2,$inout2);
2915	&movups	(&QWP(0x10,$out),$inout1);
2916	&pxor	($inout1,$inout1);
2917	&lea	($out,&DWP(0x20,$out));
2918	&movups	($ivec,&QWP(0x20,$inp));
2919	&sub	($len,0x30);
2920	&jmp	(&label("cbc_dec_tail_collected"));
2921
2922&set_label("cbc_dec_four",16);
2923	&call	("_aesni_decrypt4");
2924	&movups	($rndkey1,&QWP(0x10,$inp));
2925	&movups	($rndkey0,&QWP(0x20,$inp));
2926	&xorps	($inout0,$ivec);
2927	&movups	($ivec,&QWP(0x30,$inp));
2928	&xorps	($inout1,$in0);
2929	&movups	(&QWP(0,$out),$inout0);
2930	&xorps	($inout2,$rndkey1);
2931	&movups	(&QWP(0x10,$out),$inout1);
2932	&pxor	($inout1,$inout1);
2933	&xorps	($inout3,$rndkey0);
2934	&movups	(&QWP(0x20,$out),$inout2);
2935	&pxor	($inout2,$inout2);
2936	&lea	($out,&DWP(0x30,$out));
2937	&movaps	($inout0,$inout3);
2938	&pxor	($inout3,$inout3);
2939	&sub	($len,0x40);
2940	&jmp	(&label("cbc_dec_tail_collected"));
2941
2942&set_label("cbc_dec_clear_tail_collected",16);
2943	&pxor	($inout1,$inout1);
2944	&pxor	($inout2,$inout2);
2945	&pxor	($inout3,$inout3);
2946	&pxor	($inout4,$inout4);
2947&set_label("cbc_dec_tail_collected");
2948	&and	($len,15);
2949	&jnz	(&label("cbc_dec_tail_partial"));
2950	&movups	(&QWP(0,$out),$inout0);
2951	&pxor	($rndkey0,$rndkey0);
2952	&jmp	(&label("cbc_ret"));
2953
2954&set_label("cbc_dec_tail_partial",16);
2955	&movaps	(&QWP(0,"esp"),$inout0);
2956	&pxor	($rndkey0,$rndkey0);
2957	&mov	("ecx",16);
2958	&mov	($inp,"esp");
2959	&sub	("ecx",$len);
2960	&data_word(0xA4F3F689);		# rep movsb
2961	&movdqa	(&QWP(0,"esp"),$inout0);
2962
2963&set_label("cbc_ret");
2964	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
2965	&mov	($key_,&wparam(4));
2966	&pxor	($inout0,$inout0);
2967	&pxor	($rndkey1,$rndkey1);
2968	&movups	(&QWP(0,$key_),$ivec);	# output IV
2969	&pxor	($ivec,$ivec);
2970&set_label("cbc_abort");
2971&function_end("${PREFIX}_cbc_encrypt");
2972
2973######################################################################
2974# Mechanical port from aesni-x86_64.pl.
2975#
2976# _aesni_set_encrypt_key is private interface,
2977# input:
2978#	"eax"	const unsigned char *userKey
2979#	$rounds	int bits
2980#	$key	AES_KEY *key
2981# output:
2982#	"eax"	return code
#	$rounds	number of rounds minus one (9, 11 or 13), valid on success only
2984
&function_begin_B("_aesni_set_encrypt_key");
	# Expand a user key into an AES encryption key schedule.  This is the
	# register-argument worker called by both public set_*_key entry
	# points: "eax" = userKey, $rounds = key size in bits, $key = schedule.
	# Returns in "eax": 0 on success, -1 if either pointer is NULL, -2 if
	# the bit count is not 128, 192 or 256.  On success $rounds holds the
	# value that was written to the schedule's rounds field (9, 11 or 13);
	# every path below stores it 240 bytes past the start of the schedule.
	&push	("ebp");
	&push	("ebx");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	# call/pop fetches the run-time address of "pic"; the lea then rebases
	# ebx so that it points at the key_const table used by the *_alt paths.
	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	# NOTE(review): picmeup presumably leaves the address of
	# OPENSSL_ia32cap_P in ebp; the second dword of that vector is read
	# below and masked down to the two capability bits of interest.
	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));
	# $key is biased by 16 so that round 0 is addressed as -16($key).
	&lea	($key,&DWP(16,$key));
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	# Dispatch on the requested key size; anything else is rejected.
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	# The alternative (pshufb/aesenclast) expansion is chosen only when
	# the masked capability word equals exactly 1<<28, i.e. the AVX bit
	# is set and bit 11 is clear.
	&cmp		("ebp",1<<28);
	&je		(&label("10rounds_alt"));

	# aeskeygenassist path for 128-bit keys.  Each key_128 call stores
	# the round key currently in xmm0 and derives the next one; the very
	# first call enters at key_128_cold because round 0 was stored here.
	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call		(&label("key_128"));
	# Store the last round key, then the round count right behind the
	# 15-slot round-key area ($key points at round 10 here).
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(80,$key),$rounds);

	&jmp	(&label("good_key"));

	# Local subroutine: store the current round key and advance $key ...
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
	# ... then turn xmm0 into the next round key.  The two shufps/xorps
	# pairs fold each dword of xmm0 into all higher dwords (running XOR),
	# relying on the low dword of xmm4 staying zero; the final xorps adds
	# dword 3 of the aeskeygenassist result broadcast to every lane.
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

	# Alternative 128-bit expansion without aeskeygenassist.
	# xmm5 = byte-shuffle mask at key_const+0x00, xmm4 = round-constant
	# vector at key_const+0x20, xmm2 = copy of the previous round key.
&set_label("10rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));
	&mov		($rounds,8);
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&movdqa		("xmm2","xmm0");
	&movdqu		(&QWP(-16,$key),"xmm0");

	# Rounds 1..8: $rounds is used as a plain loop counter here.
&set_label("loop_key128");
	# Broadcast the rotated last word, then let aesenclast apply the
	# S-box and XOR in the round constant held in xmm4.
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(16,$key));

	# Running XOR of the previous round key's dwords via byte shifts.
	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm0");
	&movdqa		("xmm2","xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key128"));

	# The round constant for the last two rounds can no longer be
	# obtained by shifting, so reload it as 0x1b from key_const+0x30;
	# the pslld below turns it into 0x36 for round 10.
	&movdqa		("xmm4",&QWP(0x30,"ebx"));

	# Round 9.
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	# Round 10.
	&movdqa		("xmm2","xmm0");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(16,$key),"xmm0");

	# $key points at round 9 here, hence the different displacement
	# for the rounds field compared with the path above.
	&mov		($rounds,9);
	&mov		(&DWP(96,$key),$rounds);

	&jmp	(&label("good_key"));

	# 192-bit keys: xmm0 holds the first 128 bits, the low half of xmm2
	# the remaining 64 bits.
&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&cmp		("ebp",1<<28);
	&je		(&label("12rounds_alt"));

	# aeskeygenassist path.  Every step yields 192 bits of schedule, so
	# the helpers alternate: key_192a stores one 16-byte round key,
	# key_192b stores two.
	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call		(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call		(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(48,$key),$rounds);

	&jmp	(&label("good_key"));

	# Local subroutine: store xmm0 as a complete round key and advance.
&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
	# Keep the current 64-bit tail in xmm5; key_192b later combines it
	# with the low half of the next xmm0 to form a full round key.
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");
	# Common tail: advance the 192-bit state (xmm0 plus low half of
	# xmm2) by one step, using dword 1 of the aeskeygenassist result.
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movdqa		("xmm3","xmm2");
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&xorps		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

	# Local subroutine: store two round keys assembled from
	# (xmm5 low half | xmm0 low half) and (xmm0 high half | xmm2 low
	# half), advance $key by 32 and continue with the common tail.
&set_label("key_192b",16);
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));
	&jmp		(&label("key_192b_warm"));

	# Alternative 192-bit expansion: eight iterations, each writing 24
	# bytes of schedule.  xmm5 = mask at key_const+0x10, xmm4 = round
	# constant vector at key_const+0x20.
&set_label("12rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x10,"ebx"));
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&mov		($rounds,8);
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key192");
	# Emit the pending 64-bit tail, then derive the next 192 bits.
	&movq		(&QWP(0,$key),"xmm2");
	&movdqa		("xmm1","xmm2");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(24,$key));

	# Running XOR over the 128-bit part ...
	&movdqa		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm0","xmm3");

	# ... carried over into the 64-bit tail.
	&pshufd		("xmm3","xmm0",0xff);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");

	&pxor		("xmm0","xmm2");
	&pxor		("xmm2","xmm3");
	&movdqu		(&QWP(-16,$key),"xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key192"));

	&mov	($rounds,11);
	&mov	(&DWP(32,$key),$rounds);

	&jmp	(&label("good_key"));

	# 256-bit keys: xmm0 and xmm2 hold the two key halves, which become
	# round keys 0 and 1 verbatim.  $key is biased by a further 16.
&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea		($key,&DWP(16,$key));
	&cmp		("ebp",1<<28);
	&je		(&label("14rounds_alt"));

	# aeskeygenassist path: key_256a updates xmm0 (even rounds) and
	# key_256b updates xmm2 (odd rounds); each stores the other half.
	&mov		($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);
	# eax is cleared again at good_key; this one is redundant there.
	&xor		("eax","eax");

	&jmp	(&label("good_key"));

	# Local subroutine: store xmm2, then derive the next even round key
	# in xmm0 (same arithmetic as key_128_cold).
&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

	# Local subroutine: store xmm0, then derive the next odd round key
	# in xmm2.  It broadcasts dword 2 of the aeskeygenassist result
	# rather than dword 3 as key_256a does.
&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

	# Alternative 256-bit expansion.  xmm0/xmm1 carry the two most
	# recent round keys, xmm2 is scratch, xmm5 = mask at key_const+0x00,
	# xmm4 = round-constant vector at key_const+0x20.
&set_label("14rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&mov		($rounds,7);
	&movdqu		(&QWP(-32,$key),"xmm0");
	&movdqa		("xmm1","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm2");

&set_label("loop_key256");
	# Even round key: rotated/substituted last word plus round constant.
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	&movdqa		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm0","xmm3");
	&pslld		("xmm4",1);

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	# The seventh pass produces round 14, which has no odd partner.
	&dec		($rounds);
	&jz		(&label("done_key256"));

	# Odd round key: last word of the new even key is broadcast and
	# run through aesenclast with an all-zero key, i.e. substituted
	# without rotation or round constant.
	&pshufd		("xmm2","xmm0",0xff);
	&pxor		("xmm3","xmm3");
	&aesenclast	("xmm2","xmm3");

	&movdqa		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm1","xmm3");

	&pxor		("xmm2","xmm1");
	&movdqu		(&QWP(16,$key),"xmm2");
	&lea		($key,&DWP(32,$key));
	&movdqa		("xmm1","xmm2");
	&jmp		(&label("loop_key256"));

&set_label("done_key256");
	&mov		($rounds,13);
	&mov		(&DWP(16,$key),$rounds);

	# Common success exit: scrub every xmm register that held key
	# material, report 0 and restore the saved registers.
&set_label("good_key");
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&xor	("eax","eax");
	&pop	("ebx");
	&pop	("ebp");
	&ret	();

	# NULL userKey or NULL schedule pointer; nothing was loaded yet.
&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
	# Unsupported key size; xmm0 already holds user key bytes, so it
	# is wiped before returning.
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");
	&mov	("eax",-2);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&function_end_B("_aesni_set_encrypt_key");
3352
# int ${PREFIX}_set_encrypt_key (const unsigned char *userKey, int bits,
#                                AES_KEY *key)
#
# Public entry point taking its arguments on the stack.  It only moves them
# into the registers read by _aesni_set_encrypt_key and passes that
# routine's return code (left in eax) straight back to the caller.
&function_begin_B("${PREFIX}_set_encrypt_key");
	# [destination register, stack argument index], in emission order:
	# userKey -> eax, bits -> $rounds, key -> $key.
	foreach my $slot (["eax",0],[$rounds,1],[$key,2]) {
		my ($reg,$argno) = @$slot;
		&mov	($reg,&wparam($argno));
	}
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");
3362
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and then
# converts it in place into a decryption schedule: the order of the round
# keys is reversed and aesimc is applied to every round key except the
# first and the last.  Returns the helper's error code unchanged on
# failure, 0 on success.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	# The helper advanced $key while writing, so reload the base pointer.
	&mov	($key,&wparam(2));
	# Scale the round count to a byte offset (16 bytes per round key).
	# This is done before the error check; on failure the value is unused.
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	# eax doubles as the tail pointer from here on.
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	# First and last round keys trade places without transformation.
	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

	# Walk the head ($key) and tail (eax) pointers towards each other,
	# exchanging one aesimc-transformed pair per iteration.  Both
	# pointers are stepped before the stores, hence the +16/-16
	# displacements on the store addresses.
&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));

	# The number of round keys is odd, so one key is left in the middle
	# once the pointers meet; it is transformed where it sits.
	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	# Scrub the registers that carried round keys.
	&pxor		("xmm0","xmm0");
	&pxor		("xmm1","xmm1");
	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
3405
# Constant table for the pshufb/aesenclast key-expansion paths in
# _aesni_set_encrypt_key, which addresses it relative to ebx.  The label is
# aligned with an argument of 64.
&set_label("key_const",64);
# +0x00: pshufb mask selecting bytes 13,14,15,12 in every dword lane, i.e.
#        the last word rotated by one byte and broadcast (loaded by
#        10rounds_alt and 14rounds_alt).
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
# +0x10: same idea for bytes 5,6,7,4 (loaded by 12rounds_alt).
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
# +0x20: initial round-constant vector; the loops double it with pslld.
&data_word(1,1,1,1);
# +0x30: round constant 0x1b, reloaded by 10rounds_alt for its last two
#        rounds.
&data_word(0x1b,0x1b,0x1b,0x1b);
# Identification string embedded in the generated object.
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

# NOTE(review): asm_finish presumably emits the accumulated assembly to
# STDOUT -- confirm against the perlasm driver.
&asm_finish();

# Fail loudly if the output could not be written completely.
close STDOUT or die "error closing STDOUT: $!";
3416