#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it likewise supports both 32- and
# 64-bit modes of operation. The latter is achieved by limiting the
# number of utilized registers to 16, which implies additional NEON
# load and integer instructions. This has no effect on the mighty
# Apple A7, where results are literally equal to the theoretical
# estimates based on AES instruction latencies and issue rates. On
# Cortex-A53, an in-order execution core, this costs up to 10-15%,
# which is partially compensated by a dedicated code path for the
# 128-bit CBC encrypt case. On Cortex-A57, parallelizable-mode
# performance seems to be limited by the sheer number of NEON
# instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
# Kryo		1.26		0.94		1.00
#
# (*)	the original 3.64/1.34/1.32 results were for the r0p0 revision
#	and remain the same even for the updated module;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# $code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer instructions mostly
# 64-bit ones. The goal is to maintain both 32- and 64-bit code within
# a single module and transliterate the common code to either flavour
# with regex voodoo.
#
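# For example, the common line "vld1.32 {q8},[$key_],#16" passes
# through unchanged for 32-bit flavours, while the 64-bit
# post-processing at the bottom of this file rewrites it to
# "ld1 {v16.4s},[x7],#16" (q8-q15 map to v16-v23, presumably so that
# the partially callee-saved v8-v15 stay untouched).
#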
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
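# (the 32-bit flavour avoids q4-q7, presumably because their d8-d15
# halves are callee-saved under the AAPCS and this function does not
# save them)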


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
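// (0x01 seeds the rcon value, which is doubled each iteration up to
// 0x80; 0x1b is reloaded for the last two AES-128 round constants,
// 0x1b and 0x36)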

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask
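					// ($mask was built to rotate-n-splat
					// bytes 12-15; for 192-bit keys the
					// word to rotate sits in bytes 4-7
					// of $in1, hence each index drops by 8)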

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule
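### (the last seven round keys stay resident in q10-q15 and $rndlast;
### q8-q9 stream through the earlier ones via $key_)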

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq
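	// ("cclr" is this module's conditional-clear shorthand: it becomes
	// csel Xd,Xzr,Xd,cond on 64-bit and mov<cond> Rd,#0 on 32-bit)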

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such a way
					// that at exit from the loop
					// $dat1-$dat2 are loaded with the
					// last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule
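### (here only the last five round keys are pinned, in q12-q15 and
### $rndlast; q8-q9 again stream through the rest via $key_)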

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
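    # (unaes is retained for reference -- its call site below is
    # commented out; e.g. unaes("aese","v0.16b,v1.16b") would emit
    # ".inst	0x4e284820	//aese v0.16b,v1.16b")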

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the explicit byte order below. The correct solution would be
	    # the .inst directive, but older assemblers don't implement it :-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
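    # (e.g. "aese q0,q1" yields $word 0xf3b00302 and is emitted as
    # ".byte	0x02,0x03,0xb0,0xf3	@ aese q0,q1")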

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
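    # (32-bit vtbl.8 cannot write a whole q register at once, so e.g.
    # the .Loop128 instance "vtbl.8 q10,{q3},q2" becomes the pair
    # "vtbl.8 d20,{q3},d4" / "vtbl.8 d21,{q3},d5")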

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
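    # (e.g. "vdup.32 q9,q3[3]" becomes "vdup.32 q9,d7[1]": lane 3 of
    # q3 is lane 1 of its upper half, d7)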

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
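    # (likewise "vmov.32 q1[3],r9" becomes "vmov.32 d3[1],r9")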

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";