#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It likewise supports both 32- and 64-bit
# modes of operation. The latter is achieved by limiting the number
# of utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by implementing a dedicated code path for the
# 128-bit CBC encrypt case. On Cortex-A57 parallelizable-mode
# performance seems to be limited by the sheer amount of NEON
# instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
# Kryo		1.26		0.94		1.00
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and remain the same even for the updated module;
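#
# For reference, the entry points generated below are consumed by C
# glue code through prototypes along the following lines (a sketch of
# the expected interface; the exact declarations live in the callers,
# not in this file):
#
#	int aes_v8_set_encrypt_key(const unsigned char *user_key,
#	                           const int bits, AES_KEY *key);
#	int aes_v8_set_decrypt_key(const unsigned char *user_key,
#	                           const int bits, AES_KEY *key);
#	void aes_v8_encrypt(const unsigned char *in, unsigned char *out,
#	                    const AES_KEY *key);
#	void aes_v8_decrypt(const unsigned char *in, unsigned char *out,
#	                    const AES_KEY *key);
#	void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	                        size_t length, const AES_KEY *key,
#	                        unsigned char *ivec, const int enc);
#	void aes_v8_ctr32_encrypt_blocks(const unsigned char *in,
#	                                 unsigned char *out, size_t len,
#	                                 const AES_KEY *key,
#	                                 const unsigned char ivec[16]);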

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is
# to maintain both 32- and 64-bit code within a single module and to
# transliterate the common code to either flavour with regex voodoo.
#
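# As an illustrative sketch of what that pass does: a line written in
# the common idiom as
#
#	vld1.32	{q8},[x7],#16
#
# is emitted for 32-bit as "vld1.32 {q8},[r7]!", while the 64-bit
# rewrites (q->v register renaming, suffix fix-up, "v" prefix
# stripping) turn it into "ld1 {v16.4s},[x7],#16"; q8 maps to v16
# because the 64-bit flavour shifts q8-q15 up to v16-v23 (see the
# q->v substitution near the end of this file).
#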
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
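// The expansion loops below combine two tricks: vtbl with the
// "rotate-n-splat" mask replicates RotWord of the last key word into
// all four lanes, and aese with the all-zero $zero register then
// degenerates to plain SubBytes (ShiftRows is a no-op when all four
// words are identical), i.e. SubWord in every lane.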

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
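// Both key-setup routines follow the AES_set_encrypt_key return
// convention: -1 for a NULL input or output pointer, -2 for an
// unsupported bit length, 0 on success.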

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
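# gen_block() stamps out the two single-block entry points. The loop
# keeps two round keys in flight, so each vld1.32 of the next key can
# issue in the shadow of the aes$e/aes$mc pair using the previous one.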
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
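	// (e.g. with 10 rounds this is $key+16*4, i.e. &rk[4],
	// and rk[4..10] are exactly the last 7 round keys)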
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted so that at
					// exit from the loop $dat1-$dat2
					// are loaded with the last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8		{$dat0},[$ivp]
#else
	vld1.32		{$dat0},[$ivp]
#endif
	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
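	// (e.g. with 10 rounds this is $key+16*6, i.e. &rk[6],
	// and rk[6..10] are exactly the last 5 round keys)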
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
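	// (on little-endian $ctr is byte-swapped to host order above,
	// incremented as a plain integer, and each value is rev-ed
	// back before insertion into lane 3 of the counter block)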
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 add		$tctr0,$ctr,#1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 rev		$tctr0,$tctr0
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
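    # (kept for reference: with the substitution below re-enabled,
    # e.g. "aese v0.16b,v1.16b" would be emitted as ".inst 0x4e284820"
    # for assemblers that lack the ARMv8 crypto mnemonics)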

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian,
	    # hence the explicit byte order below. The correct solution
	    # is to use the .inst directive, but older assemblers
	    # don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
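    # For example, "aese q0,q1" encodes as the word 0xf3b00302 and is
    # emitted little-endian as ".byte 0x02,0x03,0xb0,0xf3".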

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
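    # e.g. "vtbl.8 q0,{q1},q2" becomes the d-register pair
    # "vtbl.8 d0,{q1},d4" and "vtbl.8 d1,{q1},d5"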

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
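    # e.g. "vdup.32 q0,q1[3]" becomes "vdup.32 q0,d3[1]"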

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
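    # e.g. "vmov.32 q0[3],r5" becomes "vmov.32 d1[1],r5"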

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";