1#! /usr/bin/env perl
2# Copyright 2015-2019 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10######################################################################
11## Constant-time SSSE3 AES core implementation.
12## version 0.1
13##
14## By Mike Hamburg (Stanford University), 2009
15## Public domain.
16##
17## For details see http://shiftleft.org/papers/vector_aes/ and
18## http://crypto.stanford.edu/vpaes/.
19##
20######################################################################
21# ARMv8 NEON adaptation by <appro@openssl.org>
22#
# The reason for undertaking this effort is that there is at least one
# popular SoC based on Cortex-A53 that doesn't have crypto extensions.
25#
26#                   CBC enc     ECB enc/dec(*)   [bit-sliced enc/dec]
27# Cortex-A53        21.5        18.1/20.6        [17.5/19.8         ]
28# Cortex-A57        36.0(**)    20.4/24.9(**)    [14.4/16.6         ]
29# X-Gene            45.9(**)    45.8/57.7(**)    [33.1/37.6(**)     ]
30# Denver(***)       16.6(**)    15.1/17.8(**)    [8.80/9.93         ]
31# Apple A7(***)     22.7(**)    10.9/14.3        [8.45/10.0         ]
32# Mongoose(***)     26.3(**)    21.0/25.0(**)    [13.3/16.8         ]
33#
34# (*)	ECB denotes approximate result for parallelizable modes
35#	such as CBC decrypt, CTR, etc.;
36# (**)	these results are worse than scalar compiler-generated
37#	code, but it's constant-time and therefore preferred;
38# (***)	presented for reference/comparison purposes;
39
# Command-line handling: the first argument is the perlasm "flavour"
# (e.g. linux64, ios64); the remaining arguments are scanned for the
# first one that looks like an output file name (has an extension).
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Locate the arm-xlate.pl translator relative to this script: first in
# the same directory as $0, then under ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print to STDOUT through the translator.  Quote
# $output so paths containing spaces survive the shell, and die if the
# pipe cannot be opened (the original ignored a failed open, silently
# producing no output).
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
50
# Constant pool shared by the encryption/decryption cores and the key
# schedule.  The heredoc below is AArch64 assembler text emitted verbatim
# (via arm-xlate.pl); each pair of .quad values forms one 128-bit lookup
# vector from Mike Hamburg's vpaes construction.  Do not reorder or edit
# the values: the *_preheat routines load them by fixed offset and order,
# and the cores address .Lk_sr/.Lk_mc_* with computed displacements.
$code.=<<___;
.text

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:// mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:		// sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv:	// inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
//  Decryption stuff
//
.Lk_dipt:	// decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
//  Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	// rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.asciz  "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
.align	6
___
144
145{
146my ($inp,$out,$key) = map("x$_",(0..2));
147
148my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
149my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
150my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
151
152$code.=<<___;
153##
154##  _aes_preheat
155##
156##  Fills register %r10 -> .aes_consts (so you can -fPIC)
157##  and %xmm9-%xmm15 as specified below.
158##
159.type	_vpaes_encrypt_preheat,%function
160.align	4
161_vpaes_encrypt_preheat:
162	adr	x10, .Lk_inv
163	movi	v17.16b, #0x0f
164	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
165	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
166	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
167	ret
168.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
169
170##
171##  _aes_encrypt_core
172##
173##  AES-encrypt %xmm0.
174##
175##  Inputs:
176##     %xmm0 = input
177##     %xmm9-%xmm15 as in _vpaes_preheat
178##    (%rdx) = scheduled keys
179##
180##  Output in %xmm0
181##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
182##  Preserves %xmm6 - %xmm8 so you get some local vectors
183##
184##
185.type	_vpaes_encrypt_core,%function
186.align 4
187_vpaes_encrypt_core:
188	mov	x9, $key
189	ldr	w8, [$key,#240]			// pull rounds
190	adr	x11, .Lk_mc_forward+16
191						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
192	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
193	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
194	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
195	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
196						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
197	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
198	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
199	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
200	b	.Lenc_entry
201
202.align 4
203.Lenc_loop:
204	// middle of middle round
205	add	x10, x11, #0x40
206	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
207	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
208	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
209	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
210	tbl	v5.16b,	{$sb2t}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
211	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
212	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
213	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
214	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
215	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
216	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
217	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
218	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
219	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
220	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
221	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
222	sub	w8, w8, #1			// nr--
223
224.Lenc_entry:
225	// top of round
226	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
227	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
228	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
229	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
230	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
231	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
232	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
233	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
234	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
235	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
236	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
237	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
238	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
239	cbnz	w8, .Lenc_loop
240
241	// middle of last round
242	add	x10, x11, #0x80
243						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
244						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
245	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
246	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
247	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
248	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
249	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
250	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
251	ret
252.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
253
254.globl	vpaes_encrypt
255.type	vpaes_encrypt,%function
256.align	4
257vpaes_encrypt:
258	.inst	0xd503233f			// paciasp
259	stp	x29,x30,[sp,#-16]!
260	add	x29,sp,#0
261
262	ld1	{v7.16b}, [$inp]
263	bl	_vpaes_encrypt_preheat
264	bl	_vpaes_encrypt_core
265	st1	{v0.16b}, [$out]
266
267	ldp	x29,x30,[sp],#16
268	.inst	0xd50323bf			// autiasp
269	ret
270.size	vpaes_encrypt,.-vpaes_encrypt
271
272.type	_vpaes_encrypt_2x,%function
273.align 4
274_vpaes_encrypt_2x:
275	mov	x9, $key
276	ldr	w8, [$key,#240]			// pull rounds
277	adr	x11, .Lk_mc_forward+16
278						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
279	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
280	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
281	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0
282	 and	v9.16b,  v15.16b,  v17.16b
283	 ushr	v8.16b,  v15.16b,  #4
284	tbl	v1.16b,  {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
285	 tbl	v9.16b,  {$iptlo}, v9.16b
286						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
287	tbl	v2.16b,  {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
288	 tbl	v10.16b, {$ipthi}, v8.16b
289	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
290	 eor	v8.16b,  v9.16b,   v16.16b
291	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
292	 eor	v8.16b,  v8.16b,   v10.16b
293	b	.Lenc_2x_entry
294
295.align 4
296.Lenc_2x_loop:
297	// middle of middle round
298	add	x10, x11, #0x40
299	tbl	v4.16b,  {$sb1t}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
300	 tbl	v12.16b, {$sb1t}, v10.16b
301	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
302	tbl	v0.16b,  {$sb1u}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
303	 tbl	v8.16b,  {$sb1u}, v11.16b
304	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
305	 eor	v12.16b, v12.16b, v16.16b
306	tbl	v5.16b,	 {$sb2t}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
307	 tbl	v13.16b, {$sb2t}, v10.16b
308	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
309	 eor	v8.16b,  v8.16b,  v12.16b
310	tbl	v2.16b,  {$sb2u}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
311	 tbl	v10.16b, {$sb2u}, v11.16b
312	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
313	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
314	 tbl	v11.16b, {v8.16b}, v1.16b
315	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
316	 eor	v10.16b, v10.16b, v13.16b
317	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
318	 tbl	v8.16b,  {v8.16b}, v4.16b
319	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
320	 eor	v11.16b, v11.16b, v10.16b
321	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
322	 tbl	v12.16b, {v11.16b},v1.16b
323	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
324	 eor	v8.16b,  v8.16b,  v11.16b
325	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
326	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
327	 eor	v8.16b,  v8.16b,  v12.16b
328	sub	w8, w8, #1			// nr--
329
330.Lenc_2x_entry:
331	// top of round
332	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
333	ushr	v0.16b,  v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
334	 and	v9.16b,  v8.16b, v17.16b
335	 ushr	v8.16b,  v8.16b, #4
336	tbl	v5.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
337	 tbl	v13.16b, {$invhi},v9.16b
338	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
339	 eor	v9.16b,  v9.16b,  v8.16b
340	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
341	 tbl	v11.16b, {$invlo},v8.16b
342	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
343	 tbl	v12.16b, {$invlo},v9.16b
344	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
345	 eor	v11.16b, v11.16b, v13.16b
346	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
347	 eor	v12.16b, v12.16b, v13.16b
348	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
349	 tbl	v10.16b, {$invlo},v11.16b
350	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
351	 tbl	v11.16b, {$invlo},v12.16b
352	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
353	 eor	v10.16b, v10.16b, v9.16b
354	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
355	 eor	v11.16b, v11.16b, v8.16b
356	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
357	cbnz	w8, .Lenc_2x_loop
358
359	// middle of last round
360	add	x10, x11, #0x80
361						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
362						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
363	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
364	 tbl	v12.16b, {$sbou}, v10.16b
365	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
366	tbl	v0.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
367	 tbl	v8.16b,  {$sbot}, v11.16b
368	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
369	 eor	v12.16b, v12.16b, v16.16b
370	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
371	 eor	v8.16b,  v8.16b,  v12.16b
372	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
373	 tbl	v1.16b,  {v8.16b},v1.16b
374	ret
375.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
376
377.type	_vpaes_decrypt_preheat,%function
378.align	4
379_vpaes_decrypt_preheat:
380	adr	x10, .Lk_inv
381	movi	v17.16b, #0x0f
382	adr	x11, .Lk_dipt
383	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
384	ld1	{v20.2d-v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
385	ld1	{v24.2d-v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
386	ld1	{v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
387	ret
388.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
389
390##
391##  Decryption core
392##
393##  Same API as encryption core.
394##
395.type	_vpaes_decrypt_core,%function
396.align	4
397_vpaes_decrypt_core:
398	mov	x9, $key
399	ldr	w8, [$key,#240]			// pull rounds
400
401						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
402	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	\$4, %r11
403	eor	x11, x11, #0x30			// xor		\$0x30,	%r11
404	adr	x10, .Lk_sr
405	and	x11, x11, #0x30			// and		\$0x30,	%r11
406	add	x11, x11, x10
407	adr	x10, .Lk_mc_forward+48
408
409	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
410	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
411	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
412	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
413	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
414						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
415	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
416	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
417	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
418	b	.Ldec_entry
419
420.align 4
421.Ldec_loop:
422//
423//  Inverse mix columns
424//
425						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
426						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
427	tbl	v4.16b, {$sb9u}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
428	tbl	v1.16b, {$sb9t}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
429	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
430						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
431	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
432						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
433
434	tbl	v4.16b, {$sbdu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
435	tbl 	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
436	tbl	v1.16b, {$sbdt}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
437	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
438						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
439	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
440						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
441
442	tbl	v4.16b, {$sbbu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
443	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
444	tbl	v1.16b, {$sbbt}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
445	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
446						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
447	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
448						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
449
450	tbl	v4.16b, {$sbeu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
451	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
452	tbl	v1.16b, {$sbet}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
453	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
454	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12,	%xmm5,	%xmm5,	%xmm5
455	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
456	sub	w8, w8, #1			// sub		\$1,%rax			# nr--
457
458.Ldec_entry:
459	// top of round
460	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
461	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
462	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
463	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
464	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
465	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
466	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
467	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
468	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
469	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
470	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
471	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
472	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
473	cbnz	w8, .Ldec_loop
474
475	// middle of last round
476						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
477	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
478						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
479	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
480	tbl	v1.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
481	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
482	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
483	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
484	ret
485.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
486
487.globl	vpaes_decrypt
488.type	vpaes_decrypt,%function
489.align	4
490vpaes_decrypt:
491	.inst	0xd503233f			// paciasp
492	stp	x29,x30,[sp,#-16]!
493	add	x29,sp,#0
494
495	ld1	{v7.16b}, [$inp]
496	bl	_vpaes_decrypt_preheat
497	bl	_vpaes_decrypt_core
498	st1	{v0.16b}, [$out]
499
500	ldp	x29,x30,[sp],#16
501	.inst	0xd50323bf			// autiasp
502	ret
503.size	vpaes_decrypt,.-vpaes_decrypt
504
505// v14-v15 input, v0-v1 output
506.type	_vpaes_decrypt_2x,%function
507.align	4
508_vpaes_decrypt_2x:
509	mov	x9, $key
510	ldr	w8, [$key,#240]			// pull rounds
511
512						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
513	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	\$4, %r11
514	eor	x11, x11, #0x30			// xor		\$0x30,	%r11
515	adr	x10, .Lk_sr
516	and	x11, x11, #0x30			// and		\$0x30,	%r11
517	add	x11, x11, x10
518	adr	x10, .Lk_mc_forward+48
519
520	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
521	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
522	ushr	v0.16b,  v14.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
523	 and	v9.16b,  v15.16b, v17.16b
524	 ushr	v8.16b,  v15.16b, #4
525	tbl	v2.16b,  {$iptlo},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
526	 tbl	v10.16b, {$iptlo},v9.16b
527	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
528						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
529	tbl	v0.16b,  {$ipthi},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
530	 tbl	v8.16b,  {$ipthi},v8.16b
531	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
532	 eor	v10.16b, v10.16b, v16.16b
533	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
534	 eor	v8.16b,  v8.16b,  v10.16b
535	b	.Ldec_2x_entry
536
537.align 4
538.Ldec_2x_loop:
539//
540//  Inverse mix columns
541//
542						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
543						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
544	tbl	v4.16b,  {$sb9u}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
545	 tbl	v12.16b, {$sb9u}, v10.16b
546	tbl	v1.16b,  {$sb9t}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
547	 tbl	v9.16b,  {$sb9t}, v11.16b
548	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
549	 eor	v8.16b,  v12.16b, v16.16b
550						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
551	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
552	 eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
553						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
554
555	tbl	v4.16b,  {$sbdu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
556	 tbl	v12.16b, {$sbdu}, v10.16b
557	tbl 	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
558	 tbl 	v8.16b,  {v8.16b},v5.16b
559	tbl	v1.16b,  {$sbdt}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
560	 tbl	v9.16b,  {$sbdt}, v11.16b
561	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
562	 eor	v8.16b,  v8.16b,  v12.16b
563						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
564	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
565	 eor	v8.16b,  v8.16b,  v9.16b
566						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
567
568	tbl	v4.16b,  {$sbbu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
569	 tbl	v12.16b, {$sbbu}, v10.16b
570	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
571	 tbl	v8.16b,  {v8.16b},v5.16b
572	tbl	v1.16b,  {$sbbt}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
573	 tbl	v9.16b,  {$sbbt}, v11.16b
574	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
575	 eor	v8.16b,  v8.16b,  v12.16b
576						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
577	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
578	 eor	v8.16b,  v8.16b,  v9.16b
579						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
580
581	tbl	v4.16b,  {$sbeu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
582	 tbl	v12.16b, {$sbeu}, v10.16b
583	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
584	 tbl	v8.16b,  {v8.16b},v5.16b
585	tbl	v1.16b,  {$sbet}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
586	 tbl	v9.16b,  {$sbet}, v11.16b
587	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
588	 eor	v8.16b,  v8.16b,  v12.16b
589	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr \$12,	%xmm5,	%xmm5,	%xmm5
590	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
591	 eor	v8.16b,  v8.16b,  v9.16b
592	sub	w8, w8, #1			// sub		\$1,%rax			# nr--
593
594.Ldec_2x_entry:
595	// top of round
596	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
597	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
598	 and	v9.16b,  v8.16b,  v17.16b
599	 ushr	v8.16b,  v8.16b,  #4
600	tbl	v2.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
601	 tbl	v10.16b, {$invhi},v9.16b
602	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
603	 eor	v9.16b,	 v9.16b,  v8.16b
604	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
605	 tbl	v11.16b, {$invlo},v8.16b
606	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
607	 tbl	v12.16b, {$invlo},v9.16b
608	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
609	 eor	v11.16b, v11.16b, v10.16b
610	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
611	 eor	v12.16b, v12.16b, v10.16b
612	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
613	 tbl	v10.16b, {$invlo},v11.16b
614	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
615	 tbl	v11.16b, {$invlo},v12.16b
616	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
617	 eor	v10.16b, v10.16b, v9.16b
618	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
619	 eor	v11.16b, v11.16b, v8.16b
620	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
621	cbnz	w8, .Ldec_2x_loop
622
623	// middle of last round
624						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
625	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
626	 tbl	v12.16b, {$sbou}, v10.16b
627						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
628	tbl	v1.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
629	 tbl	v9.16b,  {$sbot}, v11.16b
630	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
631	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
632	 eor	v12.16b, v12.16b, v16.16b
633	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
634	 eor	v8.16b,  v9.16b,  v12.16b
635	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
636	 tbl	v1.16b,  {v8.16b},v2.16b
637	ret
638.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
639___
640}
641{
642my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
643my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
644
645$code.=<<___;
646########################################################
647##                                                    ##
648##                  AES key schedule                  ##
649##                                                    ##
650########################################################
651.type	_vpaes_key_preheat,%function
652.align	4
653_vpaes_key_preheat:
654	adr	x10, .Lk_inv
655	movi	v16.16b, #0x5b			// .Lk_s63
656	adr	x11, .Lk_sb1
657	movi	v17.16b, #0x0f			// .Lk_s0F
658	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
659	adr	x10, .Lk_dksd
660	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
661	adr	x11, .Lk_mc_forward
662	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
663	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
664	ld1	{v8.2d}, [x10]			// .Lk_rcon
665	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
666	ret
667.size	_vpaes_key_preheat,.-_vpaes_key_preheat
668
669.type	_vpaes_schedule_core,%function
670.align	4
671_vpaes_schedule_core:
672	.inst	0xd503233f			// paciasp
673	stp	x29, x30, [sp,#-16]!
674	add	x29,sp,#0
675
676	bl	_vpaes_key_preheat		// load the tables
677
678	ld1	{v0.16b}, [$inp],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
679
680	// input transform
681	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
682	bl	_vpaes_schedule_transform
683	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
684
685	adr	x10, .Lk_sr			// lea	.Lk_sr(%rip),%r10
686	add	x8, x8, x10
687	cbnz	$dir, .Lschedule_am_decrypting
688
689	// encrypting, output zeroth round key after transform
690	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)
691	b	.Lschedule_go
692
693.Lschedule_am_decrypting:
694	// decrypting, output zeroth round key after shiftrows
695	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
696	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
697	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
698	eor	x8, x8, #0x30			// xor	\$0x30, %r8
699
700.Lschedule_go:
701	cmp	$bits, #192			// cmp	\$192,	%esi
702	b.hi	.Lschedule_256
703	b.eq	.Lschedule_192
704	// 128: fall though
705
706##
707##  .schedule_128
708##
709##  128-bit specific part of key schedule.
710##
711##  This schedule is really simple, because all its parts
712##  are accomplished by the subroutines.
713##
714.Lschedule_128:
715	mov	$inp, #10			// mov	\$10, %esi
716
717.Loop_schedule_128:
718	sub	$inp, $inp, #1			// dec	%esi
719	bl 	_vpaes_schedule_round
720	cbz 	$inp, .Lschedule_mangle_last
721	bl	_vpaes_schedule_mangle		// write output
722	b 	.Loop_schedule_128
723
724##
725##  .aes_schedule_192
726##
727##  192-bit specific part of key schedule.
728##
729##  The main body of this schedule is the same as the 128-bit
730##  schedule, but with more smearing.  The long, high side is
731##  stored in %xmm7 as before, and the short, low side is in
732##  the high bits of %xmm6.
733##
734##  This schedule is somewhat nastier, however, because each
735##  round produces 192 bits of key material, or 1.5 round keys.
736##  Therefore, on each cycle we do 2 rounds and produce 3 round
737##  keys.
738##
739.align	4
740.Lschedule_192:
741	sub	$inp, $inp, #8
742	ld1	{v0.16b}, [$inp]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
743	bl	_vpaes_schedule_transform	// input transform
744	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
745	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
746	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
747	mov	$inp, #4			// mov	\$4,	%esi
748
749.Loop_schedule_192:
750	sub	$inp, $inp, #1			// dec	%esi
751	bl	_vpaes_schedule_round
752	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	\$8,%xmm6,%xmm0,%xmm0
753	bl	_vpaes_schedule_mangle		// save key n
754	bl	_vpaes_schedule_192_smear
755	bl	_vpaes_schedule_mangle		// save key n+1
756	bl	_vpaes_schedule_round
757	cbz 	$inp, .Lschedule_mangle_last
758	bl	_vpaes_schedule_mangle		// save key n+2
759	bl	_vpaes_schedule_192_smear
760	b	.Loop_schedule_192
761
762##
763##  .aes_schedule_256
764##
765##  256-bit specific part of key schedule.
766##
767##  The structure here is very similar to the 128-bit
768##  schedule, but with an additional "low side" in
769##  %xmm6.  The low side's rounds are the same as the
770##  high side's, except no rcon and no rotation.
771##
772.align	4
773.Lschedule_256:
774	ld1	{v0.16b}, [$inp]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
775	bl	_vpaes_schedule_transform	// input transform
776	mov	$inp, #7			// mov	\$7, %esi
777
778.Loop_schedule_256:
779	sub	$inp, $inp, #1			// dec	%esi
780	bl	_vpaes_schedule_mangle		// output low result
781	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
782
783	// high round
784	bl	_vpaes_schedule_round
785	cbz 	$inp, .Lschedule_mangle_last
786	bl	_vpaes_schedule_mangle
787
788	// low round. swap xmm7 and xmm6
789	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
790	movi	v4.16b, #0
791	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
792	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
793	bl	_vpaes_schedule_low_round
794	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
795
796	b	.Loop_schedule_256
797
798##
799##  .aes_schedule_mangle_last
800##
801##  Mangler for last round of key schedule
802##  Mangles %xmm0
803##    when encrypting, outputs out(%xmm0) ^ 63
804##    when decrypting, outputs unskew(%xmm0)
805##
806##  Always called right before return... jumps to cleanup and exits
807##
808.align	4
809.Lschedule_mangle_last:
810	// schedule last round key from xmm0
811	adr	x11, .Lk_deskew			// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
812	cbnz	$dir, .Lschedule_mangle_last_dec
813
814	// encrypting
815	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
816	adr	x11, .Lk_opt			// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
817	add	$out, $out, #32			// add	\$32,	%rdx
818	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
819
820.Lschedule_mangle_last_dec:
821	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
822	sub	$out, $out, #16			// add	\$-16,	%rdx
823	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
824	bl	_vpaes_schedule_transform	// output transform
825	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)		# save last key
826
827	// cleanup
828	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
829	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
830	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
831	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
832	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
833	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
834	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
835	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
836	ldp	x29, x30, [sp],#16
837	.inst	0xd50323bf			// autiasp
838	ret
839.size	_vpaes_schedule_core,.-_vpaes_schedule_core
840
841##
842##  .aes_schedule_192_smear
843##
844##  Smear the short, low side in the 192-bit key schedule.
845##
846##  Inputs:
847##    %xmm7: high side, b  a  x  y
848##    %xmm6:  low side, d  c  0  0
849##    %xmm13: 0
850##
851##  Outputs:
852##    %xmm6: b+c+d  b+c  0  0
853##    %xmm0: b+c+d  b+c  b  a
854##
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0		// scratch zero; only lane s[3] is written below
	dup	v0.4s, v7.s[3]		// broadcast word 3 of v7 (b) to all lanes
	ins	v1.s[3], v6.s[2]	// vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
869
870##
871##  .aes_schedule_round
872##
873##  Runs one main round of the key schedule on %xmm0, %xmm7
874##
875##  Specifically, runs subbytes on the high dword of %xmm0
876##  then rotates it by one byte and xors into the low dword of
877##  %xmm7.
878##
879##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
880##  next rcon.
881##
882##  Smears the dwords of %xmm7 by xoring the low into the
883##  second low, result into third, result into highest.
884##
885##  Returns results in %xmm7 = %xmm0.
886##  Clobbers %xmm1-%xmm4, %r11.
887##
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
	ext	$rcon, $rcon, $rcon, #15	// vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8	# rotate rcon for next round
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
	// NB: when entered directly (from .Loop_schedule_256) the caller
	// zeroes v4 first; the smear below relies on v4 being zero.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	\$4,	%xmm7,	%xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	\$8,	%xmm7,	%xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
	 eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
	 eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7	# result duplicated into v0 and v7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round
934
935##
936##  .aes_schedule_transform
937##
938##  Linear-transform %xmm0 according to tables at (%r11)
939##
940##  Requires that %xmm9 = 0x0F0F... as in preheat
941##  Output in %xmm0
942##  Clobbers %xmm1, %xmm2
943##
// Split v0 into low/high nibbles, look each nibble up in the two table
// registers (reloaded per-mode by the caller from x11), and xor the
// halves back together.  Result in v0; clobbers v1, v2.
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# low nibbles (v17 = 0x0F mask)
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# high nibbles
						// vmovdqa	(%r11),	%xmm2 	# lo
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
						// vmovdqa	16(%r11),	%xmm1 # hi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
956
957##
958##  .aes_schedule_mangle
959##
960##  Mangle xmm0 from (basis-transformed) standard version
961##  to our version.
962##
963##  On encrypt,
964##    xor with 0x63
965##    multiply by circulant 0,1,1,1
966##    apply shiftrows transform
967##
968##  On decrypt,
969##    xor with 0x63
970##    multiply by "inverse mixcolumns" circulant E,B,D,9
971##    deskew
972##    apply shiftrows transform
973##
974##
975##  Writes out to (%rdx), and increments or decrements it
976##  Keeps track of round number mod 4 in %r8
977##  Preserves xmm0
978##  Clobbers xmm1-xmm5
979##
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	$dir, .Lschedule_mangle_dec

	// encrypting: xor in 0x63, then multiply by circulant 0,1,1,1
	// via three rotations (tbl with v9 = mc_forward) and xors
	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
	add	$out, $out, #16			// add	\$16,	%rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns: eight nibble-table lookups, constants
	// preloaded into v24-v31 by the decrypt preheat
						// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb	\$4,	%xmm4,	%xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo

						// vmovdqa	0x00(%r11),	%xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
						// vmovdqa	0x10(%r11),	%xmm3
	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x20(%r11),	%xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x30(%r11),	%xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x40(%r11),	%xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x50(%r11),	%xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3

						// vmovdqa	0x60(%r11),	%xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
						// vmovdqa	0x70(%r11),	%xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3

	sub	$out, $out, #16			// add	\$-16,	%rdx	# decrypt schedule is written backwards

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	// step x8 back one 16-byte entry: +48 then keep low 6 bits,
	// i.e. x8 cycles through four permutations (round number mod 4)
	add	x8, x8, #64-16			// add	\$-16,	%r8
	and	x8, x8, #~(1<<6)		// and	\$0x30,	%r8
	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
1045
.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	// rounds = nbits/32 + 5, i.e. 10/12/14 for 128/192/256-bit keys
	lsr	w9, $bits, #5		// shr	\$5,%eax
	add	w9, w9, #5		// \$5,%eax
	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	mov	$dir, #0		// mov	\$0,%ecx	# 0 selects the encrypt schedule
	mov	x8, #0x30		// mov	\$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0		// set return value to 0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
1069
.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	// rounds = nbits/32 + 5, i.e. 10/12/14 for 128/192/256-bit keys
	lsr	w9, $bits, #5		// shr	\$5,%eax
	add	w9, w9, #5		// \$5,%eax
	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	// point the output at the far end of the schedule: the decrypt
	// schedule is written backwards (_vpaes_schedule_mangle decrements)
	lsl	w9, w9, #4		// shl	\$4,%eax
	add	$out, $out, #16		// lea	16(%rdx,%rax),%rdx
	add	$out, $out, x9

	mov	$dir, #1		// mov	\$1,%ecx	# 1 selects the decrypt schedule
	lsr	w8, $bits, #1		// shr	\$1,%r8d
	and	x8, x8, #32		// and	\$32,%r8d
	eor	x8, x8, #32		// xor	\$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
1097___
1098}
1099{
1100my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
1101
1102$code.=<<___;
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	cbz	$len, .Lcbc_abort	// zero length: nothing to do
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt	// direction 0: tail-call the decrypt path

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign

	ld1	{v0.16b}, [$ivec]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	// one block per iteration; v0 carries the chaining value
	// (len presumably a multiple of 16 -- caller's contract, TODO confirm)
	ld1	{v7.16b}, [$inp],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [$ivec]	// write ivec

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
.Lcbc_abort:
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
1138
.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!	// d8-d15 are callee-saved per AAPCS64;
	stp	d12,d13,[sp,#-16]!	// this path uses v14/v15 below
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign
	ld1	{v6.16b}, [$ivec]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x	// even number of blocks: straight to 2x loop

	// odd number of 16-byte blocks: peel one single-block iteration
	ld1	{v7.16b}, [$inp], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [$out], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done

.align	4
.Lcbc_dec_loop2x:
	// two blocks per iteration; chaining values are the two ciphertexts
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b	// second block chains off first ciphertext
	orr	v6.16b, v15.16b, v15.16b	// next ivec value
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [$ivec]	// write back final ivec

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
1187___
1188if (1) {
1189$code.=<<___;
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
vpaes_ecb_encrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!	// d8-d15 are callee-saved per AAPCS64;
	stp	d12,d13,[sp,#-16]!	// this path uses v14/v15 below
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len		// reassign length
	mov	x2,  $key		// reassign key pointer
	bl	_vpaes_encrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_enc_loop		// even number of blocks: straight to 2x loop

	// odd number of 16-byte blocks: peel one single-block iteration
	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_enc_done

.align	4
.Lecb_enc_loop:
	// two independent blocks per iteration
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
1231
.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
vpaes_ecb_decrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!	// d8-d15 are callee-saved per AAPCS64;
	stp	d12,d13,[sp,#-16]!	// this path uses v14/v15 below
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len		// reassign length
	mov	x2,  $key		// reassign key pointer
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_dec_loop		// even number of blocks: straight to 2x loop

	// odd number of 16-byte blocks: peel one single-block iteration
	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_decrypt_core	// fix: was _vpaes_encrypt_core, which
					// encrypted the odd leading block
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_dec_done

.align	4
.Lecb_dec_loop:
	// two independent blocks per iteration
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
1273___
1274}	}
print $code;	# emit the generated assembly

# STDOUT is presumably reopened onto the output file earlier in the
# script; check close() so buffered write errors (e.g. full disk) are
# reported instead of silently producing a truncated .S file.
close STDOUT or die "error closing STDOUT: $!";
1278