/* xref: /linux/arch/arm64/crypto/aes-neonbs-core.S (revision 44f57d78) */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */
7
/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
16
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text

	/*
	 * Register aliases shared by the routines in this file. Every caller
	 * sets both of them right before branching to aesbs_encrypt8 or
	 * aesbs_decrypt8, and those routines modify both.
	 */
	rounds		.req	x11		// round counter
	bskey		.req	x12		// cursor into the converted key schedule
24
25	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
26	eor		\b2, \b2, \b1
27	eor		\b5, \b5, \b6
28	eor		\b3, \b3, \b0
29	eor		\b6, \b6, \b2
30	eor		\b5, \b5, \b0
31	eor		\b6, \b6, \b3
32	eor		\b3, \b3, \b7
33	eor		\b7, \b7, \b5
34	eor		\b3, \b3, \b4
35	eor		\b4, \b4, \b5
36	eor		\b2, \b2, \b7
37	eor		\b3, \b3, \b1
38	eor		\b1, \b1, \b5
39	.endm
40
41	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
42	eor		\b0, \b0, \b6
43	eor		\b1, \b1, \b4
44	eor		\b4, \b4, \b6
45	eor		\b2, \b2, \b0
46	eor		\b6, \b6, \b1
47	eor		\b1, \b1, \b5
48	eor		\b5, \b5, \b3
49	eor		\b3, \b3, \b7
50	eor		\b7, \b7, \b5
51	eor		\b2, \b2, \b5
52	eor		\b4, \b4, \b7
53	.endm
54
55	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
56	eor		\b1, \b1, \b7
57	eor		\b4, \b4, \b7
58	eor		\b7, \b7, \b5
59	eor		\b1, \b1, \b3
60	eor		\b2, \b2, \b5
61	eor		\b3, \b3, \b7
62	eor		\b6, \b6, \b1
63	eor		\b2, \b2, \b0
64	eor		\b5, \b5, \b3
65	eor		\b4, \b4, \b6
66	eor		\b0, \b0, \b6
67	eor		\b1, \b1, \b4
68	.endm
69
70	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
71	eor		\b1, \b1, \b5
72	eor		\b2, \b2, \b7
73	eor		\b3, \b3, \b1
74	eor		\b4, \b4, \b5
75	eor		\b7, \b7, \b5
76	eor		\b3, \b3, \b4
77	eor 		\b5, \b5, \b0
78	eor		\b3, \b3, \b7
79	eor		\b6, \b6, \b2
80	eor		\b2, \b2, \b1
81	eor		\b6, \b6, \b3
82	eor		\b3, \b3, \b0
83	eor		\b5, \b5, \b6
84	.endm
85
86	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
87	eor 		\t0, \y0, \y1
88	and		\t0, \t0, \x0
89	eor		\x0, \x0, \x1
90	and		\t1, \x1, \y0
91	and		\x0, \x0, \y1
92	eor		\x1, \t1, \t0
93	eor		\x0, \x0, \t1
94	.endm
95
96	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
97	eor		\t0, \y0, \y1
98	eor 		\t1, \y2, \y3
99	and		\t0, \t0, \x0
100	and		\t1, \t1, \x2
101	eor		\x0, \x0, \x1
102	eor		\x2, \x2, \x3
103	and		\x1, \x1, \y0
104	and		\x3, \x3, \y2
105	and		\x0, \x0, \y1
106	and		\x2, \x2, \y3
107	eor		\x1, \x1, \x0
108	eor		\x2, \x2, \x3
109	eor		\x0, \x0, \t0
110	eor		\x3, \x3, \t1
111	.endm
112
113	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
114				    y0, y1, y2, y3, t0, t1, t2, t3
115	eor		\t0, \x0, \x2
116	eor		\t1, \x1, \x3
117	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
118	eor		\y0, \y0, \y2
119	eor		\y1, \y1, \y3
120	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
121	eor		\x0, \x0, \t0
122	eor		\x2, \x2, \t0
123	eor		\x1, \x1, \t1
124	eor		\x3, \x3, \t1
125	eor		\t0, \x4, \x6
126	eor		\t1, \x5, \x7
127	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
128	eor		\y0, \y0, \y2
129	eor		\y1, \y1, \y3
130	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
131	eor		\x4, \x4, \t0
132	eor		\x6, \x6, \t0
133	eor		\x5, \x5, \t1
134	eor		\x7, \x7, \t1
135	.endm
136
137	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
138				   t0, t1, t2, t3, s0, s1, s2, s3
139	eor		\t3, \x4, \x6
140	eor		\t0, \x5, \x7
141	eor		\t1, \x1, \x3
142	eor		\s1, \x7, \x6
143	eor		\s0, \x0, \x2
144	eor		\s3, \t3, \t0
145	orr		\t2, \t0, \t1
146	and		\s2, \t3, \s0
147	orr		\t3, \t3, \s0
148	eor		\s0, \s0, \t1
149	and		\t0, \t0, \t1
150	eor		\t1, \x3, \x2
151	and		\s3, \s3, \s0
152	and		\s1, \s1, \t1
153	eor		\t1, \x4, \x5
154	eor		\s0, \x1, \x0
155	eor		\t3, \t3, \s1
156	eor		\t2, \t2, \s1
157	and		\s1, \t1, \s0
158	orr		\t1, \t1, \s0
159	eor		\t3, \t3, \s3
160	eor		\t0, \t0, \s1
161	eor		\t2, \t2, \s2
162	eor		\t1, \t1, \s3
163	eor		\t0, \t0, \s2
164	and		\s0, \x7, \x3
165	eor		\t1, \t1, \s2
166	and		\s1, \x6, \x2
167	and		\s2, \x5, \x1
168	orr		\s3, \x4, \x0
169	eor		\t3, \t3, \s0
170	eor		\t1, \t1, \s2
171	eor		\s0, \t0, \s3
172	eor		\t2, \t2, \s1
173	and		\s2, \t3, \t1
174	eor		\s1, \t2, \s2
175	eor		\s3, \s0, \s2
176	bsl		\s1, \t1, \s0
177	not		\t0, \s0
178	bsl		\s0, \s1, \s3
179	bsl		\t0, \s1, \s3
180	bsl		\s3, \t3, \t2
181	eor		\t3, \t3, \t2
182	and		\s2, \s0, \s3
183	eor		\t1, \t1, \t0
184	eor		\s2, \s2, \t3
185	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
186			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
187	.endm
188
189	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
190			      t0, t1, t2, t3, s0, s1, s2, s3
191	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
192			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
193	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
194			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
195			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
196			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
197	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
198			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
199	.endm
200
201	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
202				  t0, t1, t2, t3, s0, s1, s2, s3
203	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
204			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
205	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
206			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
207			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
208			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
209	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
210			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
211	.endm
212
213	.macro		enc_next_rk
214	ldp		q16, q17, [bskey], #128
215	ldp		q18, q19, [bskey, #-96]
216	ldp		q20, q21, [bskey, #-64]
217	ldp		q22, q23, [bskey, #-32]
218	.endm
219
220	.macro		dec_next_rk
221	ldp		q16, q17, [bskey, #-128]!
222	ldp		q18, q19, [bskey, #32]
223	ldp		q20, q21, [bskey, #64]
224	ldp		q22, q23, [bskey, #96]
225	.endm
226
227	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
228	eor		\x0\().16b, \x0\().16b, v16.16b
229	eor		\x1\().16b, \x1\().16b, v17.16b
230	eor		\x2\().16b, \x2\().16b, v18.16b
231	eor		\x3\().16b, \x3\().16b, v19.16b
232	eor		\x4\().16b, \x4\().16b, v20.16b
233	eor		\x5\().16b, \x5\().16b, v21.16b
234	eor		\x6\().16b, \x6\().16b, v22.16b
235	eor		\x7\().16b, \x7\().16b, v23.16b
236	.endm
237
238	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
239	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
240	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
241	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
242	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
243	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
244	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
245	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
246	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
247	.endm
248
249	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
250				  t0, t1, t2, t3, t4, t5, t6, t7, inv
251	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
252	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
253	eor		\x0\().16b, \x0\().16b, \t0\().16b
254	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
255	eor		\x1\().16b, \x1\().16b, \t1\().16b
256	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
257	eor		\x2\().16b, \x2\().16b, \t2\().16b
258	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
259	eor		\x3\().16b, \x3\().16b, \t3\().16b
260	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
261	eor		\x4\().16b, \x4\().16b, \t4\().16b
262	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
263	eor		\x5\().16b, \x5\().16b, \t5\().16b
264	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
265	eor		\x6\().16b, \x6\().16b, \t6\().16b
266	eor		\t1\().16b, \t1\().16b, \x0\().16b
267	eor		\x7\().16b, \x7\().16b, \t7\().16b
268	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
269	eor		\t2\().16b, \t2\().16b, \x1\().16b
270	eor		\t0\().16b, \t0\().16b, \x7\().16b
271	eor		\t1\().16b, \t1\().16b, \x7\().16b
272	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
273	eor		\t5\().16b, \t5\().16b, \x4\().16b
274	eor		\x0\().16b, \x0\().16b, \t0\().16b
275	eor		\t6\().16b, \t6\().16b, \x5\().16b
276	eor		\x1\().16b, \x1\().16b, \t1\().16b
277	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
278	eor		\t4\().16b, \t4\().16b, \x3\().16b
279	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
280	eor		\t7\().16b, \t7\().16b, \x6\().16b
281	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
282	eor		\t3\().16b, \t3\().16b, \x2\().16b
283	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
284	eor		\t4\().16b, \t4\().16b, \x7\().16b
285	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
286	eor		\t3\().16b, \t3\().16b, \x7\().16b
287	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
288	eor		\x7\().16b, \t1\().16b, \t5\().16b
289	.ifb		\inv
290	eor		\x2\().16b, \t0\().16b, \t4\().16b
291	eor		\x4\().16b, \x4\().16b, \t3\().16b
292	eor		\x5\().16b, \x5\().16b, \t7\().16b
293	eor		\x3\().16b, \x3\().16b, \t6\().16b
294	eor		\x6\().16b, \x6\().16b, \t2\().16b
295	.else
296	eor		\t3\().16b, \t3\().16b, \x4\().16b
297	eor		\x5\().16b, \x5\().16b, \t7\().16b
298	eor		\x2\().16b, \x3\().16b, \t6\().16b
299	eor		\x3\().16b, \t0\().16b, \t4\().16b
300	eor		\x4\().16b, \x6\().16b, \t2\().16b
301	mov		\x6\().16b, \t3\().16b
302	.endif
303	.endm
304
305	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
306				      t0, t1, t2, t3, t4, t5, t6, t7
307	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
308	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
309	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
310	eor		\t0\().16b, \t0\().16b, \x0\().16b
311	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
312	eor		\t6\().16b, \t6\().16b, \x6\().16b
313	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
314	eor		\t7\().16b, \t7\().16b, \x7\().16b
315	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
316	eor		\t1\().16b, \t1\().16b, \x1\().16b
317	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
318	eor		\t2\().16b, \t2\().16b, \x2\().16b
319	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
320	eor		\t3\().16b, \t3\().16b, \x3\().16b
321	eor		\t4\().16b, \t4\().16b, \x4\().16b
322	eor		\t5\().16b, \t5\().16b, \x5\().16b
323	eor		\x0\().16b, \x0\().16b, \t6\().16b
324	eor		\x1\().16b, \x1\().16b, \t6\().16b
325	eor		\x2\().16b, \x2\().16b, \t0\().16b
326	eor		\x4\().16b, \x4\().16b, \t2\().16b
327	eor		\x3\().16b, \x3\().16b, \t1\().16b
328	eor		\x1\().16b, \x1\().16b, \t7\().16b
329	eor		\x2\().16b, \x2\().16b, \t7\().16b
330	eor		\x4\().16b, \x4\().16b, \t6\().16b
331	eor		\x5\().16b, \x5\().16b, \t3\().16b
332	eor		\x3\().16b, \x3\().16b, \t6\().16b
333	eor		\x6\().16b, \x6\().16b, \t4\().16b
334	eor		\x4\().16b, \x4\().16b, \t7\().16b
335	eor		\x5\().16b, \x5\().16b, \t7\().16b
336	eor		\x7\().16b, \x7\().16b, \t5\().16b
337	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
338			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
339	.endm
340
341	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
342	ushr		\t0\().2d, \b0\().2d, #\n
343	ushr		\t1\().2d, \b1\().2d, #\n
344	eor		\t0\().16b, \t0\().16b, \a0\().16b
345	eor		\t1\().16b, \t1\().16b, \a1\().16b
346	and		\t0\().16b, \t0\().16b, \mask\().16b
347	and		\t1\().16b, \t1\().16b, \mask\().16b
348	eor		\a0\().16b, \a0\().16b, \t0\().16b
349	shl		\t0\().2d, \t0\().2d, #\n
350	eor		\a1\().16b, \a1\().16b, \t1\().16b
351	shl		\t1\().2d, \t1\().2d, #\n
352	eor		\b0\().16b, \b0\().16b, \t0\().16b
353	eor		\b1\().16b, \b1\().16b, \t1\().16b
354	.endm
355
356	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
357	movi		\t0\().16b, #0x55
358	movi		\t1\().16b, #0x33
359	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
360	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
361	movi		\t0\().16b, #0x0f
362	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
363	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
364	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
365	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
366	.endm
367
368
369	.align		6
370M0:	.octa		0x0004080c0105090d02060a0e03070b0f
371
372M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
373SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
374SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f
375
376M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
377ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
378ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f
379
380	/*
381	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
382	 */
383ENTRY(aesbs_convert_key)
384	ld1		{v7.4s}, [x1], #16		// load round 0 key
385	ld1		{v17.4s}, [x1], #16		// load round 1 key
386
387	movi		v8.16b,  #0x01			// bit masks
388	movi		v9.16b,  #0x02
389	movi		v10.16b, #0x04
390	movi		v11.16b, #0x08
391	movi		v12.16b, #0x10
392	movi		v13.16b, #0x20
393	movi		v14.16b, #0x40
394	movi		v15.16b, #0x80
395	ldr		q16, M0
396
397	sub		x2, x2, #1
398	str		q7, [x0], #16		// save round 0 key
399
400.Lkey_loop:
401	tbl		v7.16b ,{v17.16b}, v16.16b
402	ld1		{v17.4s}, [x1], #16		// load next round key
403
404	cmtst		v0.16b, v7.16b, v8.16b
405	cmtst		v1.16b, v7.16b, v9.16b
406	cmtst		v2.16b, v7.16b, v10.16b
407	cmtst		v3.16b, v7.16b, v11.16b
408	cmtst		v4.16b, v7.16b, v12.16b
409	cmtst		v5.16b, v7.16b, v13.16b
410	cmtst		v6.16b, v7.16b, v14.16b
411	cmtst		v7.16b, v7.16b, v15.16b
412	not		v0.16b, v0.16b
413	not		v1.16b, v1.16b
414	not		v5.16b, v5.16b
415	not		v6.16b, v6.16b
416
417	subs		x2, x2, #1
418	stp		q0, q1, [x0], #128
419	stp		q2, q3, [x0, #-96]
420	stp		q4, q5, [x0, #-64]
421	stp		q6, q7, [x0, #-32]
422	b.ne		.Lkey_loop
423
424	movi		v7.16b, #0x63			// compose .L63
425	eor		v17.16b, v17.16b, v7.16b
426	str		q17, [x0]
427	ret
428ENDPROC(aesbs_convert_key)
429
430	.align		4
431aesbs_encrypt8:
432	ldr		q9, [bskey], #16		// round 0 key
433	ldr		q8, M0SR
434	ldr		q24, SR
435
436	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
437	eor		v11.16b, v1.16b, v9.16b
438	tbl		v0.16b, {v10.16b}, v8.16b
439	eor		v12.16b, v2.16b, v9.16b
440	tbl		v1.16b, {v11.16b}, v8.16b
441	eor		v13.16b, v3.16b, v9.16b
442	tbl		v2.16b, {v12.16b}, v8.16b
443	eor		v14.16b, v4.16b, v9.16b
444	tbl		v3.16b, {v13.16b}, v8.16b
445	eor		v15.16b, v5.16b, v9.16b
446	tbl		v4.16b, {v14.16b}, v8.16b
447	eor		v10.16b, v6.16b, v9.16b
448	tbl		v5.16b, {v15.16b}, v8.16b
449	eor		v11.16b, v7.16b, v9.16b
450	tbl		v6.16b, {v10.16b}, v8.16b
451	tbl		v7.16b, {v11.16b}, v8.16b
452
453	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
454
455	sub		rounds, rounds, #1
456	b		.Lenc_sbox
457
458.Lenc_loop:
459	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
460.Lenc_sbox:
461	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
462								v13, v14, v15
463	subs		rounds, rounds, #1
464	b.cc		.Lenc_done
465
466	enc_next_rk
467
468	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
469								v13, v14, v15
470
471	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
472
473	b.ne		.Lenc_loop
474	ldr		q24, SRM0
475	b		.Lenc_loop
476
477.Lenc_done:
478	ldr		q12, [bskey]			// last round key
479
480	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
481
482	eor		v0.16b, v0.16b, v12.16b
483	eor		v1.16b, v1.16b, v12.16b
484	eor		v4.16b, v4.16b, v12.16b
485	eor		v6.16b, v6.16b, v12.16b
486	eor		v3.16b, v3.16b, v12.16b
487	eor		v7.16b, v7.16b, v12.16b
488	eor		v2.16b, v2.16b, v12.16b
489	eor		v5.16b, v5.16b, v12.16b
490	ret
491ENDPROC(aesbs_encrypt8)
492
493	.align		4
494aesbs_decrypt8:
495	lsl		x9, rounds, #7
496	add		bskey, bskey, x9
497
498	ldr		q9, [bskey, #-112]!		// round 0 key
499	ldr		q8, M0ISR
500	ldr		q24, ISR
501
502	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
503	eor		v11.16b, v1.16b, v9.16b
504	tbl		v0.16b, {v10.16b}, v8.16b
505	eor		v12.16b, v2.16b, v9.16b
506	tbl		v1.16b, {v11.16b}, v8.16b
507	eor		v13.16b, v3.16b, v9.16b
508	tbl		v2.16b, {v12.16b}, v8.16b
509	eor		v14.16b, v4.16b, v9.16b
510	tbl		v3.16b, {v13.16b}, v8.16b
511	eor		v15.16b, v5.16b, v9.16b
512	tbl		v4.16b, {v14.16b}, v8.16b
513	eor		v10.16b, v6.16b, v9.16b
514	tbl		v5.16b, {v15.16b}, v8.16b
515	eor		v11.16b, v7.16b, v9.16b
516	tbl		v6.16b, {v10.16b}, v8.16b
517	tbl		v7.16b, {v11.16b}, v8.16b
518
519	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
520
521	sub		rounds, rounds, #1
522	b		.Ldec_sbox
523
524.Ldec_loop:
525	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
526.Ldec_sbox:
527	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
528								v13, v14, v15
529	subs		rounds, rounds, #1
530	b.cc		.Ldec_done
531
532	dec_next_rk
533
534	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
535
536	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
537								v13, v14, v15
538
539	b.ne		.Ldec_loop
540	ldr		q24, ISRM0
541	b		.Ldec_loop
542.Ldec_done:
543	ldr		q12, [bskey, #-16]		// last round key
544
545	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
546
547	eor		v0.16b, v0.16b, v12.16b
548	eor		v1.16b, v1.16b, v12.16b
549	eor		v6.16b, v6.16b, v12.16b
550	eor		v4.16b, v4.16b, v12.16b
551	eor		v2.16b, v2.16b, v12.16b
552	eor		v7.16b, v7.16b, v12.16b
553	eor		v3.16b, v3.16b, v12.16b
554	eor		v5.16b, v5.16b, v12.16b
555	ret
556ENDPROC(aesbs_decrypt8)
557
558	/*
559	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
560	 *		     int blocks)
561	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
562	 *		     int blocks)
563	 */
564	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
565	frame_push	5
566
567	mov		x19, x0
568	mov		x20, x1
569	mov		x21, x2
570	mov		x22, x3
571	mov		x23, x4
572
57399:	mov		x5, #1
574	lsl		x5, x5, x23
575	subs		w23, w23, #8
576	csel		x23, x23, xzr, pl
577	csel		x5, x5, xzr, mi
578
579	ld1		{v0.16b}, [x20], #16
580	tbnz		x5, #1, 0f
581	ld1		{v1.16b}, [x20], #16
582	tbnz		x5, #2, 0f
583	ld1		{v2.16b}, [x20], #16
584	tbnz		x5, #3, 0f
585	ld1		{v3.16b}, [x20], #16
586	tbnz		x5, #4, 0f
587	ld1		{v4.16b}, [x20], #16
588	tbnz		x5, #5, 0f
589	ld1		{v5.16b}, [x20], #16
590	tbnz		x5, #6, 0f
591	ld1		{v6.16b}, [x20], #16
592	tbnz		x5, #7, 0f
593	ld1		{v7.16b}, [x20], #16
594
5950:	mov		bskey, x21
596	mov		rounds, x22
597	bl		\do8
598
599	st1		{\o0\().16b}, [x19], #16
600	tbnz		x5, #1, 1f
601	st1		{\o1\().16b}, [x19], #16
602	tbnz		x5, #2, 1f
603	st1		{\o2\().16b}, [x19], #16
604	tbnz		x5, #3, 1f
605	st1		{\o3\().16b}, [x19], #16
606	tbnz		x5, #4, 1f
607	st1		{\o4\().16b}, [x19], #16
608	tbnz		x5, #5, 1f
609	st1		{\o5\().16b}, [x19], #16
610	tbnz		x5, #6, 1f
611	st1		{\o6\().16b}, [x19], #16
612	tbnz		x5, #7, 1f
613	st1		{\o7\().16b}, [x19], #16
614
615	cbz		x23, 1f
616	cond_yield_neon
617	b		99b
618
6191:	frame_pop
620	ret
621	.endm
622
623	.align		4
624ENTRY(aesbs_ecb_encrypt)
625	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
626ENDPROC(aesbs_ecb_encrypt)
627
628	.align		4
629ENTRY(aesbs_ecb_decrypt)
630	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
631ENDPROC(aesbs_ecb_decrypt)
632
633	/*
634	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
635	 *		     int blocks, u8 iv[])
636	 */
637	.align		4
638ENTRY(aesbs_cbc_decrypt)
639	frame_push	6
640
641	mov		x19, x0
642	mov		x20, x1
643	mov		x21, x2
644	mov		x22, x3
645	mov		x23, x4
646	mov		x24, x5
647
64899:	mov		x6, #1
649	lsl		x6, x6, x23
650	subs		w23, w23, #8
651	csel		x23, x23, xzr, pl
652	csel		x6, x6, xzr, mi
653
654	ld1		{v0.16b}, [x20], #16
655	mov		v25.16b, v0.16b
656	tbnz		x6, #1, 0f
657	ld1		{v1.16b}, [x20], #16
658	mov		v26.16b, v1.16b
659	tbnz		x6, #2, 0f
660	ld1		{v2.16b}, [x20], #16
661	mov		v27.16b, v2.16b
662	tbnz		x6, #3, 0f
663	ld1		{v3.16b}, [x20], #16
664	mov		v28.16b, v3.16b
665	tbnz		x6, #4, 0f
666	ld1		{v4.16b}, [x20], #16
667	mov		v29.16b, v4.16b
668	tbnz		x6, #5, 0f
669	ld1		{v5.16b}, [x20], #16
670	mov		v30.16b, v5.16b
671	tbnz		x6, #6, 0f
672	ld1		{v6.16b}, [x20], #16
673	mov		v31.16b, v6.16b
674	tbnz		x6, #7, 0f
675	ld1		{v7.16b}, [x20]
676
6770:	mov		bskey, x21
678	mov		rounds, x22
679	bl		aesbs_decrypt8
680
681	ld1		{v24.16b}, [x24]		// load IV
682
683	eor		v1.16b, v1.16b, v25.16b
684	eor		v6.16b, v6.16b, v26.16b
685	eor		v4.16b, v4.16b, v27.16b
686	eor		v2.16b, v2.16b, v28.16b
687	eor		v7.16b, v7.16b, v29.16b
688	eor		v0.16b, v0.16b, v24.16b
689	eor		v3.16b, v3.16b, v30.16b
690	eor		v5.16b, v5.16b, v31.16b
691
692	st1		{v0.16b}, [x19], #16
693	mov		v24.16b, v25.16b
694	tbnz		x6, #1, 1f
695	st1		{v1.16b}, [x19], #16
696	mov		v24.16b, v26.16b
697	tbnz		x6, #2, 1f
698	st1		{v6.16b}, [x19], #16
699	mov		v24.16b, v27.16b
700	tbnz		x6, #3, 1f
701	st1		{v4.16b}, [x19], #16
702	mov		v24.16b, v28.16b
703	tbnz		x6, #4, 1f
704	st1		{v2.16b}, [x19], #16
705	mov		v24.16b, v29.16b
706	tbnz		x6, #5, 1f
707	st1		{v7.16b}, [x19], #16
708	mov		v24.16b, v30.16b
709	tbnz		x6, #6, 1f
710	st1		{v3.16b}, [x19], #16
711	mov		v24.16b, v31.16b
712	tbnz		x6, #7, 1f
713	ld1		{v24.16b}, [x20], #16
714	st1		{v5.16b}, [x19], #16
7151:	st1		{v24.16b}, [x24]		// store IV
716
717	cbz		x23, 2f
718	cond_yield_neon
719	b		99b
720
7212:	frame_pop
722	ret
723ENDPROC(aesbs_cbc_decrypt)
724
725	.macro		next_tweak, out, in, const, tmp
726	sshr		\tmp\().2d,  \in\().2d,   #63
727	and		\tmp\().16b, \tmp\().16b, \const\().16b
728	add		\out\().2d,  \in\().2d,   \in\().2d
729	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
730	eor		\out\().16b, \out\().16b, \tmp\().16b
731	.endm
732
733	.align		4
734.Lxts_mul_x:
735CPU_LE(	.quad		1, 0x87		)
736CPU_BE(	.quad		0x87, 1		)
737
738	/*
739	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
740	 *		     int blocks, u8 iv[])
741	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
742	 *		     int blocks, u8 iv[])
743	 */
744__xts_crypt8:
745	mov		x6, #1
746	lsl		x6, x6, x23
747	subs		w23, w23, #8
748	csel		x23, x23, xzr, pl
749	csel		x6, x6, xzr, mi
750
751	ld1		{v0.16b}, [x20], #16
752	next_tweak	v26, v25, v30, v31
753	eor		v0.16b, v0.16b, v25.16b
754	tbnz		x6, #1, 0f
755
756	ld1		{v1.16b}, [x20], #16
757	next_tweak	v27, v26, v30, v31
758	eor		v1.16b, v1.16b, v26.16b
759	tbnz		x6, #2, 0f
760
761	ld1		{v2.16b}, [x20], #16
762	next_tweak	v28, v27, v30, v31
763	eor		v2.16b, v2.16b, v27.16b
764	tbnz		x6, #3, 0f
765
766	ld1		{v3.16b}, [x20], #16
767	next_tweak	v29, v28, v30, v31
768	eor		v3.16b, v3.16b, v28.16b
769	tbnz		x6, #4, 0f
770
771	ld1		{v4.16b}, [x20], #16
772	str		q29, [sp, #.Lframe_local_offset]
773	eor		v4.16b, v4.16b, v29.16b
774	next_tweak	v29, v29, v30, v31
775	tbnz		x6, #5, 0f
776
777	ld1		{v5.16b}, [x20], #16
778	str		q29, [sp, #.Lframe_local_offset + 16]
779	eor		v5.16b, v5.16b, v29.16b
780	next_tweak	v29, v29, v30, v31
781	tbnz		x6, #6, 0f
782
783	ld1		{v6.16b}, [x20], #16
784	str		q29, [sp, #.Lframe_local_offset + 32]
785	eor		v6.16b, v6.16b, v29.16b
786	next_tweak	v29, v29, v30, v31
787	tbnz		x6, #7, 0f
788
789	ld1		{v7.16b}, [x20], #16
790	str		q29, [sp, #.Lframe_local_offset + 48]
791	eor		v7.16b, v7.16b, v29.16b
792	next_tweak	v29, v29, v30, v31
793
7940:	mov		bskey, x21
795	mov		rounds, x22
796	br		x7
797ENDPROC(__xts_crypt8)
798
799	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
800	frame_push	6, 64
801
802	mov		x19, x0
803	mov		x20, x1
804	mov		x21, x2
805	mov		x22, x3
806	mov		x23, x4
807	mov		x24, x5
808
8090:	ldr		q30, .Lxts_mul_x
810	ld1		{v25.16b}, [x24]
811
81299:	adr		x7, \do8
813	bl		__xts_crypt8
814
815	ldp		q16, q17, [sp, #.Lframe_local_offset]
816	ldp		q18, q19, [sp, #.Lframe_local_offset + 32]
817
818	eor		\o0\().16b, \o0\().16b, v25.16b
819	eor		\o1\().16b, \o1\().16b, v26.16b
820	eor		\o2\().16b, \o2\().16b, v27.16b
821	eor		\o3\().16b, \o3\().16b, v28.16b
822
823	st1		{\o0\().16b}, [x19], #16
824	mov		v25.16b, v26.16b
825	tbnz		x6, #1, 1f
826	st1		{\o1\().16b}, [x19], #16
827	mov		v25.16b, v27.16b
828	tbnz		x6, #2, 1f
829	st1		{\o2\().16b}, [x19], #16
830	mov		v25.16b, v28.16b
831	tbnz		x6, #3, 1f
832	st1		{\o3\().16b}, [x19], #16
833	mov		v25.16b, v29.16b
834	tbnz		x6, #4, 1f
835
836	eor		\o4\().16b, \o4\().16b, v16.16b
837	eor		\o5\().16b, \o5\().16b, v17.16b
838	eor		\o6\().16b, \o6\().16b, v18.16b
839	eor		\o7\().16b, \o7\().16b, v19.16b
840
841	st1		{\o4\().16b}, [x19], #16
842	tbnz		x6, #5, 1f
843	st1		{\o5\().16b}, [x19], #16
844	tbnz		x6, #6, 1f
845	st1		{\o6\().16b}, [x19], #16
846	tbnz		x6, #7, 1f
847	st1		{\o7\().16b}, [x19], #16
848
849	cbz		x23, 1f
850	st1		{v25.16b}, [x24]
851
852	cond_yield_neon	0b
853	b		99b
854
8551:	st1		{v25.16b}, [x24]
856	frame_pop
857	ret
858	.endm
859
860ENTRY(aesbs_xts_encrypt)
861	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
862ENDPROC(aesbs_xts_encrypt)
863
864ENTRY(aesbs_xts_decrypt)
865	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
866ENDPROC(aesbs_xts_decrypt)
867
868	.macro		next_ctr, v
869	mov		\v\().d[1], x8
870	adds		x8, x8, #1
871	mov		\v\().d[0], x7
872	adc		x7, x7, xzr
873	rev64		\v\().16b, \v\().16b
874	.endm
875
876	/*
877	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
878	 *		     int rounds, int blocks, u8 iv[], u8 final[])
879	 */
880ENTRY(aesbs_ctr_encrypt)
881	frame_push	8
882
883	mov		x19, x0
884	mov		x20, x1
885	mov		x21, x2
886	mov		x22, x3
887	mov		x23, x4
888	mov		x24, x5
889	mov		x25, x6
890
891	cmp		x25, #0
892	cset		x26, ne
893	add		x23, x23, x26		// do one extra block if final
894
89598:	ldp		x7, x8, [x24]
896	ld1		{v0.16b}, [x24]
897CPU_LE(	rev		x7, x7		)
898CPU_LE(	rev		x8, x8		)
899	adds		x8, x8, #1
900	adc		x7, x7, xzr
901
90299:	mov		x9, #1
903	lsl		x9, x9, x23
904	subs		w23, w23, #8
905	csel		x23, x23, xzr, pl
906	csel		x9, x9, xzr, le
907
908	tbnz		x9, #1, 0f
909	next_ctr	v1
910	tbnz		x9, #2, 0f
911	next_ctr	v2
912	tbnz		x9, #3, 0f
913	next_ctr	v3
914	tbnz		x9, #4, 0f
915	next_ctr	v4
916	tbnz		x9, #5, 0f
917	next_ctr	v5
918	tbnz		x9, #6, 0f
919	next_ctr	v6
920	tbnz		x9, #7, 0f
921	next_ctr	v7
922
9230:	mov		bskey, x21
924	mov		rounds, x22
925	bl		aesbs_encrypt8
926
927	lsr		x9, x9, x26		// disregard the extra block
928	tbnz		x9, #0, 0f
929
930	ld1		{v8.16b}, [x20], #16
931	eor		v0.16b, v0.16b, v8.16b
932	st1		{v0.16b}, [x19], #16
933	tbnz		x9, #1, 1f
934
935	ld1		{v9.16b}, [x20], #16
936	eor		v1.16b, v1.16b, v9.16b
937	st1		{v1.16b}, [x19], #16
938	tbnz		x9, #2, 2f
939
940	ld1		{v10.16b}, [x20], #16
941	eor		v4.16b, v4.16b, v10.16b
942	st1		{v4.16b}, [x19], #16
943	tbnz		x9, #3, 3f
944
945	ld1		{v11.16b}, [x20], #16
946	eor		v6.16b, v6.16b, v11.16b
947	st1		{v6.16b}, [x19], #16
948	tbnz		x9, #4, 4f
949
950	ld1		{v12.16b}, [x20], #16
951	eor		v3.16b, v3.16b, v12.16b
952	st1		{v3.16b}, [x19], #16
953	tbnz		x9, #5, 5f
954
955	ld1		{v13.16b}, [x20], #16
956	eor		v7.16b, v7.16b, v13.16b
957	st1		{v7.16b}, [x19], #16
958	tbnz		x9, #6, 6f
959
960	ld1		{v14.16b}, [x20], #16
961	eor		v2.16b, v2.16b, v14.16b
962	st1		{v2.16b}, [x19], #16
963	tbnz		x9, #7, 7f
964
965	ld1		{v15.16b}, [x20], #16
966	eor		v5.16b, v5.16b, v15.16b
967	st1		{v5.16b}, [x19], #16
968
9698:	next_ctr	v0
970	st1		{v0.16b}, [x24]
971	cbz		x23, .Lctr_done
972
973	cond_yield_neon	98b
974	b		99b
975
976.Lctr_done:
977	frame_pop
978	ret
979
980	/*
981	 * If we are handling the tail of the input (x6 != NULL), return the
982	 * final keystream block back to the caller.
983	 */
9840:	cbz		x25, 8b
985	st1		{v0.16b}, [x25]
986	b		8b
9871:	cbz		x25, 8b
988	st1		{v1.16b}, [x25]
989	b		8b
9902:	cbz		x25, 8b
991	st1		{v4.16b}, [x25]
992	b		8b
9933:	cbz		x25, 8b
994	st1		{v6.16b}, [x25]
995	b		8b
9964:	cbz		x25, 8b
997	st1		{v3.16b}, [x25]
998	b		8b
9995:	cbz		x25, 8b
1000	st1		{v7.16b}, [x25]
1001	b		8b
10026:	cbz		x25, 8b
1003	st1		{v2.16b}, [x25]
1004	b		8b
10057:	cbz		x25, 8b
1006	st1		{v5.16b}, [x25]
1007	b		8b
1008ENDPROC(aesbs_ctr_encrypt)
1009