#! /usr/bin/env perl
# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It likewise supports both 32- and 64-bit
# modes of operation; maintaining both within one module is achieved
# by limiting the number of utilized vector registers to 16, which
# implies additional NEON load and integer instructions. This has no
# effect on the mighty Apple A7, where results are literally equal to
# the theoretical estimates based on AES instruction latencies and
# issue rates. On Cortex-A53, an in-order execution core, this costs
# up to 10-15%, which is partially compensated by implementing a
# dedicated code path for the 128-bit CBC encrypt case. On Cortex-A57,
# performance of parallelizable modes seems to be limited by the
# sheer number of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
# Kryo		1.26		0.94		1.00
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and remain the same even for the updated module.

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# $code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer code mostly 64-bit ones.
# The goal is to maintain both 32- and 64-bit codes within a single
# module and transliterate common code to either flavour with regex
# voodoo.
#
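# For illustration only (a sketch, not emitted code): assuming $key_
# maps to x7, a common-code line such as
#
#	vld1.32	{q8},[$key_],#16
#
# is transliterated by the loops at the bottom of this file into
#
#	ld1	{v16.4s},[x7],#16	// 64-bit flavour
#	vld1.32	{q8},[r7]!		@ 32-bit flavour
#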
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


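# .Lrcon below holds, in order: the initial round constant (0x01,
# splatted), the tbl mask that implements rotate-n-splat of the last
# schedule word, and 0x1b, which replaces plain left-shifting of the
# round constant once it would overflow a byte (the last two AES-128
# rounds).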
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
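# Each expansion round below relies on a well-known ARMv8 shortcut:
# the tbl mask splats RotWord() of the previous schedule word into all
# four lanes, and because the data is splatted, the ShiftRows step of
# "aese" with an all-zero round key degenerates into a no-op, leaving
# pure SubBytes.  A sketch of one AES-128 round in schedule terms:
#
#	t      = SubWord(RotWord(w[i-1])) ^ rcon
#	w[i]   = w[i-4] ^ t
#	w[i+1] = w[i-3] ^ w[i]		# computed via the vext/veor
#	w[i+2] = w[i-2] ^ w[i+1]	# cascade on whole vectors
#	w[i+3] = w[i-1] ^ w[i+2]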
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
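	// 192-bit keys: 8 bytes of raw key remain; each pass of .Loop192
	// emits 24 bytes of schedule, hence the short 8-byte store below.
	// Big-endian targets take a full 16-byte store plus pointer
	// rewind instead.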
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

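	// Odd AES-256 schedule words take SubWord() without RotWord();
	// the splat-then-aese sequence below computes exactly that.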
	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
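# The decrypt schedule is the encrypt schedule walked backwards with
# InvMixColumns (aesimc) applied to every round key except the first
# and the last, as required by the "equivalent inverse cipher" flow
# of the ARMv8 aesd instruction.  The loop below swaps keys from both
# ends inwards, applying aesimc to each, and finishes with the middle
# key.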
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
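# gen_block emits a single-block encrypt or decrypt routine.  The loop
# handles two rounds per iteration, keeping each aese/aesmc (or
# aesd/aesimc) pair adjacent so that cores which fuse the two
# instructions can do so, and loading the next round key in between.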
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

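# Input loads below post-increment by $step, which starts at 16 and is
# cleared (via the cclr macro, i.e. csel on 64-bit) once exactly one
# block remains, so the final 16-byte load never advances past the
# caller's input buffer.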
$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
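# CBC decryption is interleaved three blocks deep.  Together with the
# 16 already subtracted in the prologue, the "bias" below leaves $len
# three blocks short, so the subs/b.hs pair inside the loop falls
# through to the tail code exactly when fewer than three full blocks
# remain.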
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such a way
					// that at exit from the loop
					// $dat1-$dat2 hold the last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

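# CTR mode keeps the 32-bit big-endian counter in the last word of the
# IV block.  The counter is byte-reversed into host order once, bumped
# with plain integer adds, and byte-reversed back into each block's
# last lane.  Blocks are processed three at a time, with a tail path
# for the final one or two.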
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8		{$dat0},[$ivp]
#else
	vld1.32		{$dat0},[$ivp]
#endif
	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
___
$code.=<<___	if ($flavour =~ /64/);
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
___
$code.=<<___;
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat0,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	 add		$tctr0,$ctr,#1
___
$code.=<<___;
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat1,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	 rev		$tctr0,$tctr0
___
$code.=<<___;
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
___
$code.=<<___;
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
___
$code.=<<___	if ($flavour =~ /64/);
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
___
$code.=<<___	if ($flavour !~ /64/);
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
___
$code.=<<___;
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
___
$code.=<<___	if ($flavour !~ /64/);
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
___
$code.=<<___;
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
___
$code.=<<___	if ($flavour =~ /64/);
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
___
$code.=<<___;
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);
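    # AArch64 encodes the AES destination register in bits [4:0] and
    # the source in bits [9:5], hence "|$1|($2<<5)" below.  The helper
    # is kept for reference; the substitution that used it is commented
    # out in the loop further down.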

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

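	# q8-q15 are deliberately mapped to v16-v23 rather than v8-v15,
	# so the 64-bit flavour never touches the callee-saved lower
	# halves of v8-v15 and needs no vector save/restore.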
	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so
	    # emit the raw bytes.  The correct solution is to use the
	    # .inst directive, but older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

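    # The helpers below rewrite ARMv8-style q-register operations in
    # terms of ARMv7 d registers: vtbl can only produce 64 bits at a
    # time, and lane duplicates/moves address d-register lanes, not
    # q-register ones.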
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";
