1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13# details].
14
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Add this script's own directory and its ../../perlasm sibling to @INC
# so that x86asm.pl can be found regardless of the current directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# Instruction used for every key-schedule load/store; it is invoked
# indirectly as &$movekey(...). The former selector compared the
# never-assigned $RREFIX with the misspelt "aseni", so it could only
# ever produce *movups; that result is now stated outright instead of
# hiding behind a dead conditional.
# NOTE(review): movaps faults on memory operands that are not 16-byte
# aligned, so switching to it presumably requires aligned key schedules
# - confirm with the callers before changing this.
$movekey = *movups;
27
# General-purpose register roles.
($len,$rounds,$key,$inp,$out)=("eax","ecx","edx","esi","edi");
($rounds_,$key_)=("ebx","ebp");	# backup copies of $rounds and $key

# SSE register roles. $in1 and $inout3 deliberately share xmm7, so
# code that uses $inout3 cannot rely on $in1 staying intact.
($inout0,$inout1,$inout2)=("xmm0","xmm1","xmm2");
($rndkey0,$rndkey1)=("xmm3","xmm4");
$ivec="xmm5";
$in0="xmm6";
$in1=$inout3="xmm7";
44
# Inline version of internal aesni_[en|de]crypt1: emits the one-block
# round loop right where it is called, without prologue or return.
# The argument is "enc" or "dec". The emitted code takes the key
# schedule pointer in $key and the loop count in $rounds, advances $key,
# counts $rounds down to zero and transforms $inout0 in place, so the
# caller has to restore $key/$rounds before reusing them.
sub aesni_inline_generate1
{ my ($mode)=@_;
  my $op="aes${mode}";
  my $loop="${mode}1_loop";

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &lea		($key,&DWP(32,$key));
    &pxor		($inout0,$rndkey0);	# xor in the first round key
    &set_label($loop);
	eval"&${op}	($inout0,$rndkey1)";
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));	# fetch next round key
	&lea		($key,&DWP(16,$key));
    &jnz		(&label($loop));	# branches on the result of dec
    eval"&${op}last	($inout0,$rndkey1)";
}
61
# Emit the private routine _aesni_[en|de]crypt1 (argument is "enc" or
# "dec"): a fully unrolled one-block transform of $inout0. $rounds is
# compared against 11 to pick the entry point: below 11 enters at the
# "128" label, exactly 11 at "192", anything else runs the whole chain.
sub aesni_generate1	# fully unrolled loop
{ my ($mode)=@_;
  my $fn="_aesni_${mode}rypt1";
  my $turn=0;
  # One round: apply the key already sitting in a register, then refill
  # that same register from $off($key). The two key registers take
  # turns, $rndkey1 going first.
  my $round=sub
  { my ($off)=@_;
    my $rk=($turn++ & 1) ? $rndkey0 : $rndkey1;
	eval"&aes${mode}	($inout0,$rk)";
	&$movekey	($rk,&QWP($off,$key));
  };

    &function_begin_B($fn);
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&cmp		($rounds,11);
	&pxor		($inout0,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&jb		(&label("${mode}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${mode}192"));
	&lea		($key,&DWP(0x20,$key));
	foreach my $off (-0x40,-0x30)	{ &$round($off); }
    &set_label("${mode}192");
	foreach my $off (-0x20,-0x10)	{ &$round($off); }
    &set_label("${mode}128");
	foreach my $i (0..7)		{ &$round(0x10*$i); }
	eval"&aes${mode}	($inout0,$rndkey1)";
    eval"&aes${mode}last	($inout0,$rndkey0)";
    &ret();
    &function_end_B($fn);
}
107
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points. Both are emitted by the same template, the
# encrypt flavour first: load the block at inp, read the round count
# from offset 240 of the key structure, run the one-block transform
# (inlined, or via _aesni_[en|de]crypt1 when $inline is off) and store
# the result at out.
foreach my $mode ("enc","dec")
{   my $entry="${PREFIX}_${mode}rypt";

    &aesni_generate1($mode) if (!$inline);
    &function_begin_B($entry);
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# out
	if ($inline)
	{   &aesni_inline_generate1($mode);	}
	else
	{   &call	("_aesni_${mode}rypt1");	}
	&movups	(&QWP(0,"eax"),$inout0);
	&ret	();
    &function_end_B($entry);
}
139
140# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
141# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
142# latency is 6, it turned out that it can be scheduled only every
143# *second* cycle. Thus 3x interleave is the one providing optimal
144# utilization, i.e. when subroutine's throughput is virtually same as
145# of non-interleaved subroutine [for number of input blocks up to 3].
146# This is why it makes no sense to implement 2x subroutine. As soon
147# as/if Intel improves throughput by making it possible to schedule
# the instructions in question *every* cycle I would have to
149# implement 6x interleave and use it in loop...
# Emit the private routine _aesni_[en|de]crypt3 (argument is "enc" or
# "dec"), which transforms $inout0, $inout1 and $inout2 with the rounds
# of the three blocks interleaved. $rounds is halved on entry because
# every loop iteration applies two round keys; $key and $rounds are
# consumed, so callers restore them afterwards.
sub aesni_generate3
{ my ($mode)=@_;
  my $fn="_aesni_${mode}rypt3";
  my $loop="${mode}3_loop";
  # Emit aes<mode><suffix> for one data register and one key register.
  my $aes=sub
  { my ($suffix,$blk,$rk)=@_;
    eval"&aes${mode}${suffix}	($blk,$rk)";
  };

    &function_begin_B($fn);
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	foreach my $blk ($inout0,$inout1,$inout2)
	{   &pxor	($blk,$rndkey0);	}
	&jmp		(&label($loop));
    &set_label($loop,16);
	&$aes		("",$inout0,$rndkey1);
	&$movekey	($rndkey0,&QWP(0,$key));
	&$aes		("",$inout1,$rndkey1);
	&dec		($rounds);
	&$aes		("",$inout2,$rndkey1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$aes		("",$inout0,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&$aes		("",$inout1,$rndkey0);
	&$aes		("",$inout2,$rndkey0);
	&jnz		(&label($loop));	# branches on the result of dec
    &$aes		("",$inout0,$rndkey1);
    &$movekey		($rndkey0,&QWP(0,$key));	# final round key
    &$aes		("",$inout1,$rndkey1);
    &$aes		("",$inout2,$rndkey1);
    foreach my $blk ($inout0,$inout1,$inout2)
    {   &$aes		("last",$blk,$rndkey0);	}
    &ret();
    &function_end_B($fn);
}
184
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits the private routine _aesni_[en|de]crypt4 ($p is "enc" or "dec"),
# which transforms $inout0..$inout3 with interleaved rounds. $rounds is
# halved on entry because every loop iteration applies two round keys;
# $key and $rounds are consumed, so callers restore them afterwards.
# Note that $inout3 shares its register with $in1.
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shr		($rounds,1);
	&lea		($key,&DWP(32,$key));
	&pxor		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	# The loop label used to be "${p}3_loop", copied from the 3x
	# routine; it is named after this routine now.
	&jmp		(&label("${p}4_loop"));
    &set_label("${p}4_loop",16);
	eval"&aes${p}	($inout0,$rndkey1)";
	&$movekey	($rndkey0,&QWP(0,$key));
	eval"&aes${p}	($inout1,$rndkey1)";
	&dec		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	&$movekey	($rndkey1,&QWP(16,$key));
	eval"&aes${p}	($inout0,$rndkey0)";
	&lea		($key,&DWP(32,$key));
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	&jnz		(&label("${p}4_loop"));	# branches on the result of dec
    eval"&aes${p}	($inout0,$rndkey1)";
    &$movekey		($rndkey0,&QWP(0,$key));	# final round key
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}	($inout3,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    eval"&aes${p}last	($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
# Emit the interleaved helpers, 3x before 4x. The decrypt flavours are
# always generated; the encrypt flavours only for the "aesni" prefix,
# the only configuration in which aesni_ecb_encrypt (their sole caller
# in this file) is generated.
foreach my $generate (\&aesni_generate3,\&aesni_generate4)
{   &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}
232
if ($PREFIX eq "aesni") {
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# ECB-mode bulk routine, generated only for the "aesni" prefix. A
# length below 16 returns immediately; otherwise the length is rounded
# down to a multiple of 16. A non-zero enc selects encryption, zero
# selects decryption. Each direction runs a loop that handles three
# blocks per iteration and always leaves one to four blocks for the
# tail, which dispatches on the exact remaining byte count.
# $key_/$rounds_ keep pristine copies of $key/$rounds because the
# block routines consume both.
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds,&wparam(4));	# enc flag, tested below
	&cmp	($len,16);
	&jb	(&label("ecb_ret"));
	&and	($len,-16);
	&test	($rounds,$rounds);	# flags consumed by the jz below
	&mov	($rounds,&DWP(240,$key));
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&jz	(&label("ecb_decrypt"));

	&sub	($len,0x40);
	&jbe	(&label("ecb_enc_tail"));
	&jmp	(&label("ecb_enc_loop3"));

&set_label("ecb_enc_loop3",16);
	&movups	($inout0,&QWP(0,$inp));
	&movups	($inout1,&QWP(0x10,$inp));
	&movups	($inout2,&QWP(0x20,$inp));
	&call	("_aesni_encrypt3");
	&sub	($len,0x30);
	&lea	($inp,&DWP(0x30,$inp));
	&lea	($out,&DWP(0x30,$out));
	&movups	(&QWP(-0x30,$out),$inout0);
	&mov	($key,$key_);		# restore $key
	&movups	(&QWP(-0x20,$out),$inout1);
	&mov	($rounds,$rounds_);	# restore $rounds
	&movups	(&QWP(-0x10,$out),$inout2);
	&ja	(&label("ecb_enc_loop3"));

&set_label("ecb_enc_tail");
	&add	($len,0x40);		# undo the bias applied before the loop
	&jz	(&label("ecb_ret"));

	&cmp	($len,0x10);
	&movups	($inout0,&QWP(0,$inp));
	&je	(&label("ecb_enc_one"));
	&cmp	($len,0x20);
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&cmp	($len,0x30);
	&movups	($inout2,&QWP(0x20,$inp));
	&je	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&call	("_aesni_encrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&call	("_aesni_encrypt3");	# third lane's result is simply not stored
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_three",16);
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_decrypt",16);
	&sub	($len,0x40);
	&jbe	(&label("ecb_dec_tail"));
	&jmp	(&label("ecb_dec_loop3"));

&set_label("ecb_dec_loop3",16);
	&movups	($inout0,&QWP(0,$inp));
	&movups	($inout1,&QWP(0x10,$inp));
	&movups	($inout2,&QWP(0x20,$inp));
	&call	("_aesni_decrypt3");
	&sub	($len,0x30);
	&lea	($inp,&DWP(0x30,$inp));
	&lea	($out,&DWP(0x30,$out));
	&movups	(&QWP(-0x30,$out),$inout0);
	&mov	($key,$key_);		# restore $key
	&movups	(&QWP(-0x20,$out),$inout1);
	&mov	($rounds,$rounds_);	# restore $rounds
	&movups	(&QWP(-0x10,$out),$inout2);
	&ja	(&label("ecb_dec_loop3"));

&set_label("ecb_dec_tail");
	&add	($len,0x40);		# undo the bias applied before the loop
	&jz	(&label("ecb_ret"));

	&cmp	($len,0x10);
	&movups	($inout0,&QWP(0,$inp));
	&je	(&label("ecb_dec_one"));
	&cmp	($len,0x20);
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&cmp	($len,0x30);
	&movups	($inout2,&QWP(0x20,$inp));
	&je	(&label("ecb_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&call	("_aesni_decrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&call	("_aesni_decrypt3");	# third lane's result is simply not stored
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_three",16);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	# falls through to ecb_ret

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
}
377
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
#
# CBC-mode bulk routine. A zero enc selects decryption, anything else
# encryption. The chaining value is loaded from ivp on entry and written
# back to ivp on exit.
#
# Encryption works one block at a time; a trailing partial block is
# copied to the output, zero-padded to 16 bytes and then encrypted in
# place. Decryption handles three blocks per loop iteration and leaves
# at most 0x40 bytes for the tail; when the length is not a multiple of
# 16, the last decrypted block is staged on the stack and only the
# remaining length&15 bytes are copied out.
#
# A zero length returns through cbc_abort without touching ivp: at that
# point $ivec has not been loaded yet, so storing it (as the shared
# cbc_ret exit does) would overwrite the caller's IV with whatever the
# register happened to hold.
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&test	($len,$len);
	&mov	($key_,&wparam(4));
	&jz	(&label("cbc_abort"));	# nothing to do, leave IV alone

	&cmp	(&wparam(5),0);		# flags consumed by the je below
	&movups	($ivec,&QWP(0,$key_));	# load IV
	&mov	($rounds,&DWP(240,$key));
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&je	(&label("cbc_decrypt"));

	&movaps	($inout0,$ivec);
	&cmp	($len,16);
	&jb	(&label("cbc_enc_tail"));
	&sub	($len,16);
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movups	($ivec,&QWP(0,$inp));
	&lea	($inp,&DWP(16,$inp));
	&pxor	($inout0,$ivec);	# chain previous ciphertext into input
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&sub	($len,16);
	&lea	($out,&DWP(16,$out));
	&mov	($rounds,$rounds_);	# restore $rounds
	&mov	($key,$key_);		# restore $key
	&movups	(&QWP(-16,$out),$inout0);
	&jnc	(&label("cbc_enc_loop"));
	&add	($len,16);
	&jnz	(&label("cbc_enc_tail"));
	&movaps	($ivec,$inout0);	# last ciphertext block is the new IV
	&jmp	(&label("cbc_ret"));

&set_label("cbc_enc_tail");
	&mov	("ecx",$len);		# zaps $rounds
	&data_word(0xA4F3F689);		# rep movsb
	&mov	("ecx",16);		# zero tail
	&sub	("ecx",$len);
	&xor	("eax","eax");		# zaps $len
	&data_word(0xAAF3F689);		# rep stosb
	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
	&mov	($rounds,$rounds_);	# restore $rounds
	&mov	($inp,$out);		# $inp and $out are the same
	&mov	($key,$key_);		# restore $key
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_decrypt",16);
	&sub	($len,0x40);
	&jbe	(&label("cbc_dec_tail"));
	&jmp	(&label("cbc_dec_loop3"));

&set_label("cbc_dec_loop3",16);
	&movups	($inout0,&QWP(0,$inp));
	&movups	($inout1,&QWP(0x10,$inp));
	&movups	($inout2,&QWP(0x20,$inp));
	&movaps	($in0,$inout0);		# keep ciphertext for chaining
	&movaps	($in1,$inout1);
	&call	("_aesni_decrypt3");
	&sub	($len,0x30);
	&lea	($inp,&DWP(0x30,$inp));
	&lea	($out,&DWP(0x30,$out));
	&pxor	($inout0,$ivec);
	&pxor	($inout1,$in0);
	&movups	($ivec,&QWP(-0x10,$inp));	# reloaded before any store
	&pxor	($inout2,$in1);
	&movups	(&QWP(-0x30,$out),$inout0);
	&mov	($rounds,$rounds_);	# restore $rounds
	&movups	(&QWP(-0x20,$out),$inout1);
	&mov	($key,$key_);		# restore $key
	&movups	(&QWP(-0x10,$out),$inout2);
	&ja	(&label("cbc_dec_loop3"));

&set_label("cbc_dec_tail");
	&add	($len,0x40);		# undo the bias applied before the loop
	&jz	(&label("cbc_ret"));

	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x10);
	&movaps	($in0,$inout0);
	&jbe	(&label("cbc_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&cmp	($len,0x20);
	&movaps	($in1,$inout1);
	&jbe	(&label("cbc_dec_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x30);
	&jbe	(&label("cbc_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));	# shares its register with $in1
	&call	("_aesni_decrypt4");
	&movups	($rndkey0,&QWP(0x10,$inp));	# re-read ciphertext for chaining
	&movups	($rndkey1,&QWP(0x20,$inp));
	&pxor	($inout0,$ivec);
	&pxor	($inout1,$in0);
	&movups	($ivec,&QWP(0x30,$inp));
	&movups	(&QWP(0,$out),$inout0);
	&pxor	($inout2,$rndkey0);
	&pxor	($inout3,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movaps	($inout0,$inout3);	# last block is stored by the common tail
	&lea	($out,&DWP(0x30,$out));
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one");
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&pxor	($inout0,$ivec);
	&movaps	($ivec,$in0);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two");
	&call	("_aesni_decrypt3");
	&pxor	($inout0,$ivec);
	&pxor	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout1);
	&movaps	($ivec,$in1);
	&lea	($out,&DWP(0x10,$out));
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three");
	&call	("_aesni_decrypt3");
	&pxor	($inout0,$ivec);
	&pxor	($inout1,$in0);
	&pxor	($inout2,$in1);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movaps	($inout0,$inout2);
	&movups	($ivec,&QWP(0x20,$inp));
	&lea	($out,&DWP(0x20,$out));

&set_label("cbc_dec_tail_collected");
	&and	($len,15);
	&jnz	(&label("cbc_dec_tail_partial"));
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("cbc_ret"));

&set_label("cbc_dec_tail_partial");
	&mov	($key_,"esp");		# remember the stack pointer
	&sub	("esp",16);
	&and	("esp",-16);		# movaps below needs an aligned slot
	&movaps	(&QWP(0,"esp"),$inout0);
	&mov	($inp,"esp");
	&mov	("ecx",$len);
	&data_word(0xA4F3F689);		# rep movsb
	&mov	("esp",$key_);

&set_label("cbc_ret");
	&mov	($key_,&wparam(4));
	&movups	(&QWP(0,$key_),$ivec);	# output IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
542
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code
#	$rounds	rounds
#
# Return code is 0 on success, -1 if either pointer is NULL and -2 if
# bits is not 128, 192 or 256. On success $rounds holds 9, 11 or 13
# respectively, and the same value is stored at offset 240 from the
# start of the key structure. $key is advanced while the schedule is
# written, so callers must not assume it still points at the start.

&function_begin_B("_aesni_set_encrypt_key");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&pxor	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&lea	($key,&DWP(16,$key));
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call		(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(80,$key),$rounds);	# $key is 160 in, so offset 240
	&xor		("eax","eax");
	&ret();

&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&pxor		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pxor		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b11111111);	# critical path
	&pxor		("xmm0","xmm1");
	&ret();

&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call		(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call		(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(48,$key),$rounds);	# $key is 192 in, so offset 240
	&xor		("eax","eax");
	&ret();

&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movaps		("xmm3","xmm2");
	&pxor		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&pxor		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));
	&jmp		(&label("key_192b_warm"));

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&mov		($rounds,13);
	&lea		($key,&DWP(16,$key));
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);	# $key is 224 in, so offset 240
	&xor		("eax","eax");
	&ret();

&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&pxor		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pxor		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b11111111);	# critical path
	&pxor		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&pxor		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&pxor		("xmm2","xmm4");
	&pshufd		("xmm1","xmm1",0b10101010);	# critical path
	&pxor		("xmm2","xmm1");
	&ret();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&ret	();
&set_label("bad_keybits",4);
	&mov	("eax",-2);
	&ret	();
&function_end_B("_aesni_set_encrypt_key");
731
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: moves the three stack arguments into the registers
# that _aesni_set_encrypt_key takes its input in and returns that
# routine's result in "eax" untouched.
{   my $entry="${PREFIX}_set_encrypt_key";
    # destination register, stack argument index
    my @args=(["eax",0],[$rounds,1],[$key,2]);

    &function_begin_B($entry);
    foreach my $arg (@args)
    {	&mov	($arg->[0],&wparam($arg->[1]));	}
	&call	("_aesni_set_encrypt_key");
	&ret	();
    &function_end_B($entry);
}
741
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and, if
# that succeeded, converts it in place into a decryption schedule: the
# round keys are reversed end-for-end, and every key except the first
# and the last is additionally passed through aesimc. A failure code
# from _aesni_set_encrypt_key is returned unchanged with the schedule
# left as that routine produced it.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));	# callee advanced $key, reload it
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&cmp		("eax",$key);		# pointers met in the middle yet?
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&ja		(&label("dec_key_inverse"));

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
# Hand the identification banner to asciz, then call asm_finish (both
# come from the perlasm front end loaded at the top of the script).
{   my $banner="AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>";

    &asciz($banner);
    &asm_finish();
}
785