/* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
#include "arm_arch.h"

.text

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	//	mc_forward
.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
.quad	0x080B0A0904070605, 0x000302010C0F0E0D
.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	//	mc_backward
.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
.quad	0x020100030E0D0C0F, 0x0A09080B06050407
.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:	//	sr
.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad	0x030E09040F0A0500, 0x0B06010C07020D08
.quad	0x0F060D040B020900, 0x070E050C030A0108
.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv:	//	inv, inva
.quad	0x0E05060F0D080180, 0x040703090A0B0C02
.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	//	input transform (lo, hi)
.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	//	sbou, sbot
.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	//	sb1u, sb1t
.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	//	sb2u, sb2t
.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
//  Decryption stuff
//
.Lk_dipt:	//	decryption input transform
.quad	0x0F505B040B545F00, 0x154A411E114E451A
.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	//	decryption sbox final output
.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	//	decryption sbox output *9*u, *9*t
.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	//	decryption sbox output *D*u, *D*t
.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	//	decryption sbox output *B*u, *B*t
.quad	0xD022649296B44200, 0x602646F6B0F2D404
.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	//	decryption sbox output *E*u, *E*t
.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
//  Key schedule constants
//
.Lk_dksd:	//	decryption key schedule: invskew x*D
.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	//	decryption key schedule: invskew x*B
.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	//	decryption key schedule: invskew x*9
.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	//	rcon
.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	//	output transform
.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2
.size	_vpaes_consts,.-_vpaes_consts
.align	6
//
//  _vpaes_encrypt_preheat
//
//  Fills register %r10 -> .aes_consts (so you can -fPIC)
//  and %xmm9-%xmm15 as specified below; the AArch64 register
//  map actually used here is summarized after this block.
//
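//
//  AArch64 register map after preheat (as loaded below):
//      v17     = 0x0f nibble mask
//      v18-v19 = .Lk_inv
//      v20-v21 = .Lk_ipt
//      v22-v23 = .Lk_sbo
//      v24-v25 = .Lk_sb1
//      v26-v27 = .Lk_sb2
//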
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

//
//  _vpaes_encrypt_core
//
//  AES-encrypt %xmm0 (v7 in this AArch64 port).
//
//  Inputs:
//     %xmm0 = input (here: v7)
//     %xmm9-%xmm15 as in _vpaes_encrypt_preheat (here: v17-v27)
//    (%rdx) = scheduled keys (here: x2)
//
//  Output in %xmm0 (here: v0)
//  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
//  Preserves %xmm6 - %xmm8 so you get some local vectors
//
//
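//
//  The ushr/and/tbl pattern used throughout this file is the vector
//  permutation trick: each byte is split into its two nibbles, which
//  then index a pair of 16-entry tables.  Roughly, as a C sketch
//  (names are illustrative only, not symbols defined in this file):
//
//      lo  = byte & 0x0F;                  // and  vX, state, v17
//      hi  = byte >> 4;                    // ushr vX, state, #4
//      out = tbl_lo[lo] ^ tbl_hi[hi];      // two tbl + eor
//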
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Lenc_entry

.align	4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt

.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b,  v15.16b,  v17.16b
	ushr	v8.16b,  v15.16b,  #4
	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	tbl	v9.16b,  {v20.16b}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v8.16b,  v9.16b,   v16.16b
	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b,  v8.16b,   v10.16b
	b	.Lenc_2x_entry

.align	4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	tbl	v8.16b,  {v24.16b}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b,  v8.16b,  v12.16b
	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	tbl	v8.16b,  {v8.16b}, v4.16b
	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	eor	v8.16b,  v8.16b,  v11.16b
	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b,  v8.16b,  v12.16b
	sub	w8, w8, #1			// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b,  v8.16b, v17.16b
	ushr	v8.16b,  v8.16b, #4
	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b,  v9.16b,  v8.16b
	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	tbl	v8.16b,  {v23.16b}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b,  v8.16b,  v12.16b
	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	tbl	v1.16b,  {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	adr	x11, .Lk_dipt
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

//
//  Decryption core
//
//  Same API as encryption core.
//
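//
//  As with encryption, the state enters in v7 and leaves in v0.  The
//  round loop below folds inverse MixColumns into the four table pairs
//  dsb9/dsbd/dsbb/dsbe and rotates the v5 permutation with ext between
//  applications.
//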
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor		$0x30,	%r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and		$0x30,	%r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Ldec_entry

.align	4
.Ldec_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	sub	w8, w8, #1			// sub		$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor		$0x30,	%r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and		$0x30,	%r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b,  v15.16b, v17.16b
	ushr	v8.16b,  v15.16b, #4
	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	tbl	v10.16b, {v20.16b},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	tbl	v8.16b,  {v21.16b},v8.16b
	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b,  v8.16b,  v10.16b
	b	.Ldec_2x_entry

.align	4
.Ldec_2x_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v12.16b, {v24.16b}, v10.16b
	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	tbl	v9.16b,  {v25.16b}, v11.16b
	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
	eor	v8.16b,  v12.16b, v16.16b
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v12.16b, {v26.16b}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	tbl	v9.16b,  {v27.16b}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b,  v8.16b,  v12.16b
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v12.16b, {v28.16b}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	tbl	v9.16b,  {v29.16b}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b,  v8.16b,  v12.16b
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v12.16b, {v30.16b}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	tbl	v9.16b,  {v31.16b}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b,  v8.16b,  v12.16b
	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b
	sub	w8, w8, #1			// sub		$1,%rax			# nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b,  v8.16b,  v17.16b
	ushr	v8.16b,  v8.16b,  #4
	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	tbl	v10.16b, {v19.16b},v9.16b
	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b,	 v9.16b,  v8.16b
	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	tbl	v9.16b,  {v23.16b}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	eor	v8.16b,  v9.16b,  v12.16b
	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	tbl	v1.16b,  {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
////////////////////////////////////////////////////////
//                                                    //
//                  AES key schedule                  //
//                                                    //
////////////////////////////////////////////////////////
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adr	x10, .Lk_inv
	movi	v16.16b, #0x5b			// .Lk_s63
	adr	x11, .Lk_sb1
	movi	v17.16b, #0x0f			// .Lk_s0F
	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
	adr	x10, .Lk_dksd
	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
	adr	x11, .Lk_mc_forward
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat		// load the tables

	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7

	adr	x10, .Lk_sr			// lea	.Lk_sr(%rip),%r10
	add	x8, x8, x10
	cbnz	w3, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
	eor	x8, x8, #0x30			// xor	$0x30, %r8

.Lschedule_go:
	cmp	w1, #192			// cmp	$192,	%esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

//
//  .schedule_128
//
//  128-bit specific part of key schedule.
//
//  This schedule is really simple, because all its parts
//  are accomplished by the subroutines.
//
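//
//  Bookkeeping: AES-128 needs 11 round keys.  The zeroth key is written
//  before .Lschedule_go, and the loop below runs 10 times; on the final
//  pass it branches to .Lschedule_mangle_last instead of the regular
//  mangle/write.
//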
.Lschedule_128:
	mov	x0, #10			// mov	$10, %esi

.Loop_schedule_128:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// write output
	b	.Loop_schedule_128

//
//  .aes_schedule_192
//
//  192-bit specific part of key schedule.
//
//  The main body of this schedule is the same as the 128-bit
//  schedule, but with more smearing.  The long, high side is
//  stored in %xmm7 as before, and the short, low side is in
//  the high bits of %xmm6.
//
//  This schedule is somewhat nastier, however, because each
//  round produces 192 bits of key material, or 1.5 round keys.
//  Therefore, on each cycle we do 2 rounds and produce 3 round
//  keys.
//
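//
//  Bookkeeping: AES-192 needs 13 round keys (the zeroth is written before
//  .Lschedule_go).  The loop below runs 4 times and produces 3 round keys
//  per pass (the last key of the final pass is written by
//  .Lschedule_mangle_last), which covers the remaining 12.
//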
.align	4
.Lschedule_192:
	sub	x0, x0, #8
	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
	mov	x0, #4			// mov	$4,	%esi

.Loop_schedule_192:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

//
//  .aes_schedule_256
//
//  256-bit specific part of key schedule.
//
//  The structure here is very similar to the 128-bit
//  schedule, but with an additional "low side" in
//  %xmm6.  The low side's rounds are the same as the
//  high side's, except no rcon and no rotation.
//
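//
//  Bookkeeping: AES-256 needs 15 round keys; the zeroth is written before
//  .Lschedule_go and .Lschedule_mangle_last writes the final one, so the
//  loop below (7 passes, alternating high and low rounds) emits the 13
//  keys in between.
//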
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	x0, #7			// mov	$7, %esi

.Loop_schedule_256:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7

	b	.Loop_schedule_256

//
//  .aes_schedule_mangle_last
//
//  Mangler for last round of key schedule
//  Mangles %xmm0
//    when encrypting, outputs out(%xmm0) ^ 0x63
//    when decrypting, outputs unskew(%xmm0)
//
//  Always called right before return... jumps to cleanup and exits
//
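//
//  Note: the cleanup block at the end zeroes v0-v7 so that no
//  key-derived data is left behind in the vector registers on return.
//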
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adr	x11, .Lk_deskew			// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	cbnz	w3, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
	adr	x11, .Lk_opt			// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
	add	x2, x2, #32			// add	$32,	%rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute

.Lschedule_mangle_last_dec:
	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
	sub	x2, x2, #16			// add	$-16,	%rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
	ldp	x29, x30, [sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

//
//  .aes_schedule_192_smear
//
//  Smear the short, low side in the 192-bit key schedule.
//
//  Inputs:
//    %xmm7: high side, b  a  x  y
//    %xmm6:  low side, d  c  0  0
//    %xmm13: 0
//
//  Outputs:
//    %xmm6: b+c+d  b+c  0  0
//    %xmm0: b+c+d  b+c  b  a
//
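//
//  AArch64 note: the two vpshufd shuffles become dup/ins below, i.e.
//  v1 is built as (c 0 0 0) and v0 as (b b b a) in the notation above;
//  two eors then accumulate them into v6, and ins re-zeroes v6's low
//  half.
//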
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

//
//  .aes_schedule_round
//
//  Runs one main round of the key schedule on %xmm0, %xmm7
//
//  Specifically, runs subbytes on the high dword of %xmm0
//  then rotates it by one byte and xors into the low dword of
//  %xmm7.
//
//  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
//  next rcon.
//
//  Smears the dwords of %xmm7 by xoring the low into the
//  second low, result into third, result into highest.
//
//  Returns results in %xmm7 = %xmm0.
//  Clobbers %xmm1-%xmm4, %r11.
//
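//
//  For reference, one such round matches the textbook expansion step
//  (illustrative C sketch on 32-bit words; "prev"/"next" are not names
//  used in this file):
//
//      uint32_t t = SubWord(RotWord(prev[3])) ^ rcon;
//      next[0] = prev[0] ^ t;
//      next[1] = prev[1] ^ next[0];
//      next[2] = prev[2] ^ next[1];
//      next[3] = prev[3] ^ next[2];
//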
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

//
//  .aes_schedule_transform
//
//  Linear-transform %xmm0 according to tables at (%r11)
//
//  Requires that %xmm9 = 0x0F0F... as in preheat
//  Output in %xmm0
//  Clobbers %xmm1, %xmm2
//
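//
//  In equation form (lo/hi being the table pair at (%r11) and
//  16(%r11), here preloaded into v20/v21):
//
//      out = lo[x & 0x0F] ^ hi[x >> 4]
//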
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
						// vmovdqa	(%r11),	%xmm2 	# lo
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
						// vmovdqa	16(%r11),	%xmm1 # hi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

//
//  .aes_schedule_mangle
//
//  Mangle xmm0 from (basis-transformed) standard version
//  to our version.
//
//  On encrypt,
//    xor with 0x63
//    multiply by circulant 0,1,1,1
//    apply shiftrows transform
//
//  On decrypt,
//    xor with 0x63
//    multiply by "inverse mixcolumns" circulant E,B,D,9
//    deskew
//    apply shiftrows transform
//
//
//  Writes out to (%rdx), and increments or decrements it
//  Keeps track of round number mod 4 in %r8
//  Preserves xmm0
//  Clobbers xmm1-xmm5
//
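//
//  In this port the (%rdx) pointer is x2, and the mod-4 round counter is
//  the x8 offset into .Lk_sr set up in _vpaes_schedule_core; the ld1
//  from [x8] below fetches the matching shiftrows permutation.
//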
916bc3d5698SJohn Baldwin.type	_vpaes_schedule_mangle,%function
917bc3d5698SJohn Baldwin.align	4
918bc3d5698SJohn Baldwin_vpaes_schedule_mangle:
919bc3d5698SJohn Baldwin	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
920bc3d5698SJohn Baldwin						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
921bc3d5698SJohn Baldwin	cbnz	w3, .Lschedule_mangle_dec
922bc3d5698SJohn Baldwin
923bc3d5698SJohn Baldwin	// encrypting
924bc3d5698SJohn Baldwin	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
925bc3d5698SJohn Baldwin	add	x2, x2, #16			// add	$16,	%rdx
926bc3d5698SJohn Baldwin	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
927bc3d5698SJohn Baldwin	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
928bc3d5698SJohn Baldwin	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
929bc3d5698SJohn Baldwin	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
930bc3d5698SJohn Baldwin	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
931bc3d5698SJohn Baldwin	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
932bc3d5698SJohn Baldwin
933bc3d5698SJohn Baldwin	b	.Lschedule_mangle_both
934bc3d5698SJohn Baldwin.align	4
935bc3d5698SJohn Baldwin.Lschedule_mangle_dec:
936bc3d5698SJohn Baldwin	// inverse mix columns
937bc3d5698SJohn Baldwin						// lea	.Lk_dksd(%rip),%r11
938bc3d5698SJohn Baldwin	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
939bc3d5698SJohn Baldwin	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
940bc3d5698SJohn Baldwin
941bc3d5698SJohn Baldwin						// vmovdqa	0x00(%r11),	%xmm2
942bc3d5698SJohn Baldwin	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
943bc3d5698SJohn Baldwin						// vmovdqa	0x10(%r11),	%xmm3
944bc3d5698SJohn Baldwin	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
945bc3d5698SJohn Baldwin	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
946bc3d5698SJohn Baldwin	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
947bc3d5698SJohn Baldwin
948bc3d5698SJohn Baldwin						// vmovdqa	0x20(%r11),	%xmm2
949bc3d5698SJohn Baldwin	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
950bc3d5698SJohn Baldwin	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
951bc3d5698SJohn Baldwin						// vmovdqa	0x30(%r11),	%xmm3
952bc3d5698SJohn Baldwin	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
953bc3d5698SJohn Baldwin	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
954bc3d5698SJohn Baldwin	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
955bc3d5698SJohn Baldwin
956bc3d5698SJohn Baldwin						// vmovdqa	0x40(%r11),	%xmm2
957bc3d5698SJohn Baldwin	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
958bc3d5698SJohn Baldwin	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
959bc3d5698SJohn Baldwin						// vmovdqa	0x50(%r11),	%xmm3
960bc3d5698SJohn Baldwin	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
961bc3d5698SJohn Baldwin	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
962bc3d5698SJohn Baldwin
963bc3d5698SJohn Baldwin						// vmovdqa	0x60(%r11),	%xmm2
964bc3d5698SJohn Baldwin	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
965bc3d5698SJohn Baldwin	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
966bc3d5698SJohn Baldwin						// vmovdqa	0x70(%r11),	%xmm4
967bc3d5698SJohn Baldwin	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
968bc3d5698SJohn Baldwin	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
969bc3d5698SJohn Baldwin	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
970bc3d5698SJohn Baldwin	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
971bc3d5698SJohn Baldwin
972bc3d5698SJohn Baldwin	sub	x2, x2, #16			// add	$-16,	%rdx
973bc3d5698SJohn Baldwin
974bc3d5698SJohn Baldwin.Lschedule_mangle_both:
975bc3d5698SJohn Baldwin	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
976bc3d5698SJohn Baldwin	add	x8, x8, #64-16			// add	$-16,	%r8
977bc3d5698SJohn Baldwin	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
978bc3d5698SJohn Baldwin	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
979bc3d5698SJohn Baldwin	ret
980bc3d5698SJohn Baldwin.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
981bc3d5698SJohn Baldwin
982bc3d5698SJohn Baldwin.globl	vpaes_set_encrypt_key
983bc3d5698SJohn Baldwin.type	vpaes_set_encrypt_key,%function
984bc3d5698SJohn Baldwin.align	4
985bc3d5698SJohn Baldwinvpaes_set_encrypt_key:
986bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
987bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-16]!
988bc3d5698SJohn Baldwin	add	x29,sp,#0
989bc3d5698SJohn Baldwin	stp	d8,d9,[sp,#-16]!	// ABI spec says so
990bc3d5698SJohn Baldwin
991bc3d5698SJohn Baldwin	lsr	w9, w1, #5		// shr	$5,%eax
992bc3d5698SJohn Baldwin	add	w9, w9, #5		// add	$5,%eax
993bc3d5698SJohn Baldwin	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
994bc3d5698SJohn Baldwin
995bc3d5698SJohn Baldwin	mov	w3, #0		// mov	$0,%ecx
996bc3d5698SJohn Baldwin	mov	x8, #0x30		// mov	$0x30,%r8d
997bc3d5698SJohn Baldwin	bl	_vpaes_schedule_core
998bc3d5698SJohn Baldwin	eor	x0, x0, x0
999bc3d5698SJohn Baldwin
1000bc3d5698SJohn Baldwin	ldp	d8,d9,[sp],#16
1001bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#16
1002bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
1003bc3d5698SJohn Baldwin	ret
1004bc3d5698SJohn Baldwin.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
1005bc3d5698SJohn Baldwin
1006bc3d5698SJohn Baldwin.globl	vpaes_set_decrypt_key
1007bc3d5698SJohn Baldwin.type	vpaes_set_decrypt_key,%function
1008bc3d5698SJohn Baldwin.align	4
1009bc3d5698SJohn Baldwinvpaes_set_decrypt_key:
1010bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
1011bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-16]!
1012bc3d5698SJohn Baldwin	add	x29,sp,#0
1013bc3d5698SJohn Baldwin	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1014bc3d5698SJohn Baldwin
1015bc3d5698SJohn Baldwin	lsr	w9, w1, #5		// shr	$5,%eax
1016bc3d5698SJohn Baldwin	add	w9, w9, #5		// add	$5,%eax
1017bc3d5698SJohn Baldwin	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
1018bc3d5698SJohn Baldwin	lsl	w9, w9, #4		// shl	$4,%eax
1019bc3d5698SJohn Baldwin	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
1020bc3d5698SJohn Baldwin	add	x2, x2, x9
1021bc3d5698SJohn Baldwin
1022bc3d5698SJohn Baldwin	mov	w3, #1		// mov	$1,%ecx
1023bc3d5698SJohn Baldwin	lsr	w8, w1, #1		// shr	$1,%r8d
1024bc3d5698SJohn Baldwin	and	x8, x8, #32		// and	$32,%r8d
1025bc3d5698SJohn Baldwin	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
1026bc3d5698SJohn Baldwin	bl	_vpaes_schedule_core
1027bc3d5698SJohn Baldwin
1028bc3d5698SJohn Baldwin	ldp	d8,d9,[sp],#16
1029bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#16
1030bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
1031bc3d5698SJohn Baldwin	ret
1032bc3d5698SJohn Baldwin.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
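// Calling-convention note: these entry points appear to follow the usual
// AES_set_encrypt_key/AES_set_decrypt_key shape (x0 = user key bytes,
// w1 = key length in bits, x2 = AES_KEY, round count stored at byte offset
// 240 as above). A minimal usage sketch in C -- the prototypes here are an
// assumption, not taken from this file:
//
//	#include <openssl/aes.h>
//
//	int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
//	                          AES_KEY *key);
//	int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
//	                          AES_KEY *key);
//
//	static void expand_keys(const unsigned char k[16],
//	                        AES_KEY *enc, AES_KEY *dec)
//	{
//		vpaes_set_encrypt_key(k, 128, enc);
//		vpaes_set_decrypt_key(k, 128, dec);
//	}
//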
1033bc3d5698SJohn Baldwin.globl	vpaes_cbc_encrypt
1034bc3d5698SJohn Baldwin.type	vpaes_cbc_encrypt,%function
1035bc3d5698SJohn Baldwin.align	4
1036bc3d5698SJohn Baldwinvpaes_cbc_encrypt:
1037bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
1038bc3d5698SJohn Baldwin	cbz	x2, .Lcbc_abort
1039bc3d5698SJohn Baldwin	cmp	w5, #0			// check direction
1040bc3d5698SJohn Baldwin	b.eq	vpaes_cbc_decrypt
1041bc3d5698SJohn Baldwin
1042bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-16]!
1043bc3d5698SJohn Baldwin	add	x29,sp,#0
1044bc3d5698SJohn Baldwin
1045bc3d5698SJohn Baldwin	mov	x17, x2		// reassign
1046bc3d5698SJohn Baldwin	mov	x2,  x3		// reassign
1047bc3d5698SJohn Baldwin
1048bc3d5698SJohn Baldwin	ld1	{v0.16b}, [x4]	// load ivec
1049bc3d5698SJohn Baldwin	bl	_vpaes_encrypt_preheat
1050bc3d5698SJohn Baldwin	b	.Lcbc_enc_loop
1051bc3d5698SJohn Baldwin
1052bc3d5698SJohn Baldwin.align	4
1053bc3d5698SJohn Baldwin.Lcbc_enc_loop:
1054bc3d5698SJohn Baldwin	ld1	{v7.16b}, [x0],#16	// load input
1055bc3d5698SJohn Baldwin	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
1056bc3d5698SJohn Baldwin	bl	_vpaes_encrypt_core
1057bc3d5698SJohn Baldwin	st1	{v0.16b}, [x1],#16	// save output
1058bc3d5698SJohn Baldwin	subs	x17, x17, #16
1059bc3d5698SJohn Baldwin	b.hi	.Lcbc_enc_loop
1060bc3d5698SJohn Baldwin
1061bc3d5698SJohn Baldwin	st1	{v0.16b}, [x4]	// write ivec
1062bc3d5698SJohn Baldwin
1063bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#16
1064bc3d5698SJohn Baldwin.Lcbc_abort:
1065bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
1066bc3d5698SJohn Baldwin	ret
1067bc3d5698SJohn Baldwin.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
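// The encrypt loop above is plain CBC chaining. An equivalent C sketch, with
// a hypothetical aes_encrypt_block() standing in for _vpaes_encrypt_core and
// assuming len is a non-zero multiple of 16:
//
//	#include <stddef.h>
//	#include <stdint.h>
//	#include <string.h>
//
//	void aes_encrypt_block(const uint8_t in[16], uint8_t out[16]); /* hypothetical */
//
//	static void cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
//	                        uint8_t iv[16])
//	{
//		uint8_t blk[16];
//		while (len >= 16) {
//			for (int i = 0; i < 16; i++)
//				blk[i] = in[i] ^ iv[i];  /* xor with ivec */
//			aes_encrypt_block(blk, iv);      /* ciphertext becomes next ivec */
//			memcpy(out, iv, 16);
//			in += 16; out += 16; len -= 16;
//		}
//		/* iv[] now holds the last ciphertext block, matching the
//		 * "write ivec" store above */
//	}
//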
1068bc3d5698SJohn Baldwin
1069bc3d5698SJohn Baldwin.type	vpaes_cbc_decrypt,%function
1070bc3d5698SJohn Baldwin.align	4
1071bc3d5698SJohn Baldwinvpaes_cbc_decrypt:
1072bd9588bcSAndrew Turner	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
1073bd9588bcSAndrew Turner	// only from vpaes_cbc_encrypt which has already signed the return address.
1074bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-16]!
1075bc3d5698SJohn Baldwin	add	x29,sp,#0
1076bc3d5698SJohn Baldwin	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1077bc3d5698SJohn Baldwin	stp	d10,d11,[sp,#-16]!
1078bc3d5698SJohn Baldwin	stp	d12,d13,[sp,#-16]!
1079bc3d5698SJohn Baldwin	stp	d14,d15,[sp,#-16]!
1080bc3d5698SJohn Baldwin
1081bc3d5698SJohn Baldwin	mov	x17, x2		// reassign
1082bc3d5698SJohn Baldwin	mov	x2,  x3		// reassign
1083bc3d5698SJohn Baldwin	ld1	{v6.16b}, [x4]	// load ivec
1084bc3d5698SJohn Baldwin	bl	_vpaes_decrypt_preheat
1085bc3d5698SJohn Baldwin	tst	x17, #16
1086bc3d5698SJohn Baldwin	b.eq	.Lcbc_dec_loop2x
1087bc3d5698SJohn Baldwin
1088bc3d5698SJohn Baldwin	ld1	{v7.16b}, [x0], #16	// load input
1089bc3d5698SJohn Baldwin	bl	_vpaes_decrypt_core
1090bc3d5698SJohn Baldwin	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
1091bc3d5698SJohn Baldwin	orr	v6.16b, v7.16b, v7.16b	// next ivec value
1092bc3d5698SJohn Baldwin	st1	{v0.16b}, [x1], #16
1093bc3d5698SJohn Baldwin	subs	x17, x17, #16
1094bc3d5698SJohn Baldwin	b.ls	.Lcbc_dec_done
1095bc3d5698SJohn Baldwin
1096bc3d5698SJohn Baldwin.align	4
1097bc3d5698SJohn Baldwin.Lcbc_dec_loop2x:
1098bc3d5698SJohn Baldwin	ld1	{v14.16b,v15.16b}, [x0], #32
1099bc3d5698SJohn Baldwin	bl	_vpaes_decrypt_2x
1100bc3d5698SJohn Baldwin	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
1101bc3d5698SJohn Baldwin	eor	v1.16b, v1.16b, v14.16b
1102bc3d5698SJohn Baldwin	orr	v6.16b, v15.16b, v15.16b
1103bc3d5698SJohn Baldwin	st1	{v0.16b,v1.16b}, [x1], #32
1104bc3d5698SJohn Baldwin	subs	x17, x17, #32
1105bc3d5698SJohn Baldwin	b.hi	.Lcbc_dec_loop2x
1106bc3d5698SJohn Baldwin
1107bc3d5698SJohn Baldwin.Lcbc_dec_done:
1108bc3d5698SJohn Baldwin	st1	{v6.16b}, [x4]
1109bc3d5698SJohn Baldwin
1110bc3d5698SJohn Baldwin	ldp	d14,d15,[sp],#16
1111bc3d5698SJohn Baldwin	ldp	d12,d13,[sp],#16
1112bc3d5698SJohn Baldwin	ldp	d10,d11,[sp],#16
1113bc3d5698SJohn Baldwin	ldp	d8,d9,[sp],#16
1114bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#16
1115bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
1116bc3d5698SJohn Baldwin	ret
1117bc3d5698SJohn Baldwin.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
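// Decryption runs the chain in reverse: each output block is the decrypted
// ciphertext xor-ed with the previous ciphertext block (or the IV), and the
// current ciphertext block becomes the next ivec; the 2x path simply handles
// two blocks per call. A per-block C sketch with a hypothetical
// aes_decrypt_block() (same headers as the sketch above):
//
//	void aes_decrypt_block(const uint8_t in[16], uint8_t out[16]); /* hypothetical */
//
//	static void cbc_decrypt(const uint8_t *in, uint8_t *out, size_t len,
//	                        uint8_t iv[16])
//	{
//		uint8_t cur[16], tmp[16];
//		while (len >= 16) {
//			memcpy(cur, in, 16);             /* save ciphertext: next ivec */
//			aes_decrypt_block(cur, tmp);
//			for (int i = 0; i < 16; i++)
//				out[i] = tmp[i] ^ iv[i]; /* xor with ivec */
//			memcpy(iv, cur, 16);
//			in += 16; out += 16; len -= 16;
//		}
//	}
//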
1118bc3d5698SJohn Baldwin.globl	vpaes_ecb_encrypt
1119bc3d5698SJohn Baldwin.type	vpaes_ecb_encrypt,%function
1120bc3d5698SJohn Baldwin.align	4
1121bc3d5698SJohn Baldwinvpaes_ecb_encrypt:
1122bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
1123bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-16]!
1124bc3d5698SJohn Baldwin	add	x29,sp,#0
1125bc3d5698SJohn Baldwin	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1126bc3d5698SJohn Baldwin	stp	d10,d11,[sp,#-16]!
1127bc3d5698SJohn Baldwin	stp	d12,d13,[sp,#-16]!
1128bc3d5698SJohn Baldwin	stp	d14,d15,[sp,#-16]!
1129bc3d5698SJohn Baldwin
1130bc3d5698SJohn Baldwin	mov	x17, x2
1131bc3d5698SJohn Baldwin	mov	x2,  x3
1132bc3d5698SJohn Baldwin	bl	_vpaes_encrypt_preheat
1133bc3d5698SJohn Baldwin	tst	x17, #16
1134bc3d5698SJohn Baldwin	b.eq	.Lecb_enc_loop
1135bc3d5698SJohn Baldwin
1136bc3d5698SJohn Baldwin	ld1	{v7.16b}, [x0],#16
1137bc3d5698SJohn Baldwin	bl	_vpaes_encrypt_core
1138bc3d5698SJohn Baldwin	st1	{v0.16b}, [x1],#16
1139bc3d5698SJohn Baldwin	subs	x17, x17, #16
1140bc3d5698SJohn Baldwin	b.ls	.Lecb_enc_done
1141bc3d5698SJohn Baldwin
1142bc3d5698SJohn Baldwin.align	4
1143bc3d5698SJohn Baldwin.Lecb_enc_loop:
1144bc3d5698SJohn Baldwin	ld1	{v14.16b,v15.16b}, [x0], #32
1145bc3d5698SJohn Baldwin	bl	_vpaes_encrypt_2x
1146bc3d5698SJohn Baldwin	st1	{v0.16b,v1.16b}, [x1], #32
1147bc3d5698SJohn Baldwin	subs	x17, x17, #32
1148bc3d5698SJohn Baldwin	b.hi	.Lecb_enc_loop
1149bc3d5698SJohn Baldwin
1150bc3d5698SJohn Baldwin.Lecb_enc_done:
1151bc3d5698SJohn Baldwin	ldp	d14,d15,[sp],#16
1152bc3d5698SJohn Baldwin	ldp	d12,d13,[sp],#16
1153bc3d5698SJohn Baldwin	ldp	d10,d11,[sp],#16
1154bc3d5698SJohn Baldwin	ldp	d8,d9,[sp],#16
1155bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#16
1156bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
1157bc3d5698SJohn Baldwin	ret
1158bc3d5698SJohn Baldwin.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
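// Both ECB entry points (and the CBC decrypt path above) share the same
// length dispatch: if the 16-byte block count is odd, one block is handled
// up front, then the rest is processed two blocks per iteration. In C terms,
// with hypothetical process_one_block()/process_two_blocks() helpers and len
// assumed to be a multiple of 16:
//
//	if (len & 16) {                          /* the "tst x17, #16" test */
//		process_one_block(in, out);      /* e.g. _vpaes_encrypt_core */
//		in += 16; out += 16; len -= 16;
//	}
//	while (len) {
//		process_two_blocks(in, out);     /* e.g. _vpaes_encrypt_2x */
//		in += 32; out += 32; len -= 32;
//	}
//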
1159bc3d5698SJohn Baldwin
1160bc3d5698SJohn Baldwin.globl	vpaes_ecb_decrypt
1161bc3d5698SJohn Baldwin.type	vpaes_ecb_decrypt,%function
1162bc3d5698SJohn Baldwin.align	4
1163bc3d5698SJohn Baldwinvpaes_ecb_decrypt:
1164bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
1165bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-16]!
1166bc3d5698SJohn Baldwin	add	x29,sp,#0
1167bc3d5698SJohn Baldwin	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1168bc3d5698SJohn Baldwin	stp	d10,d11,[sp,#-16]!
1169bc3d5698SJohn Baldwin	stp	d12,d13,[sp,#-16]!
1170bc3d5698SJohn Baldwin	stp	d14,d15,[sp,#-16]!
1171bc3d5698SJohn Baldwin
1172bc3d5698SJohn Baldwin	mov	x17, x2
1173bc3d5698SJohn Baldwin	mov	x2,  x3
1174bc3d5698SJohn Baldwin	bl	_vpaes_decrypt_preheat
1175bc3d5698SJohn Baldwin	tst	x17, #16
1176bc3d5698SJohn Baldwin	b.eq	.Lecb_dec_loop
1177bc3d5698SJohn Baldwin
1178bc3d5698SJohn Baldwin	ld1	{v7.16b}, [x0],#16
1179bc3d5698SJohn Baldwin	bl	_vpaes_decrypt_core	// single odd block; must use the decrypt core (decrypt tables were preheated above)
1180bc3d5698SJohn Baldwin	st1	{v0.16b}, [x1],#16
1181bc3d5698SJohn Baldwin	subs	x17, x17, #16
1182bc3d5698SJohn Baldwin	b.ls	.Lecb_dec_done
1183bc3d5698SJohn Baldwin
1184bc3d5698SJohn Baldwin.align	4
1185bc3d5698SJohn Baldwin.Lecb_dec_loop:
1186bc3d5698SJohn Baldwin	ld1	{v14.16b,v15.16b}, [x0], #32
1187bc3d5698SJohn Baldwin	bl	_vpaes_decrypt_2x
1188bc3d5698SJohn Baldwin	st1	{v0.16b,v1.16b}, [x1], #32
1189bc3d5698SJohn Baldwin	subs	x17, x17, #32
1190bc3d5698SJohn Baldwin	b.hi	.Lecb_dec_loop
1191bc3d5698SJohn Baldwin
1192bc3d5698SJohn Baldwin.Lecb_dec_done:
1193bc3d5698SJohn Baldwin	ldp	d14,d15,[sp],#16
1194bc3d5698SJohn Baldwin	ldp	d12,d13,[sp],#16
1195bc3d5698SJohn Baldwin	ldp	d10,d11,[sp],#16
1196bc3d5698SJohn Baldwin	ldp	d8,d9,[sp],#16
1197bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#16
1198bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
1199bc3d5698SJohn Baldwin	ret
1200bc3d5698SJohn Baldwin.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt