xref: /linux/arch/arm64/crypto/aes-neon.S (revision 44f57d78)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
4 *
5 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <asm/assembler.h>
10
/*
 * Wrap the generic entry-point macros so that every function emitted by
 * the shared mode code in aes-modes.S (included further down) gets a
 * "neon_" symbol prefix.
 */
11#define AES_ENTRY(func)		ENTRY(neon_ ## func)
12#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
13
	/* v7 is reserved as the XTS tweak mask register (alias used by the
	 * XTS code in aes-modes.S, not referenced in this file itself) */
14	xtsmask		.req	v7
15
	/*
	 * Reload xtsmask after it has been clobbered.  \tmp is a scratch
	 * general-purpose register; xts_load_mask is defined elsewhere —
	 * TODO confirm it comes from aes-modes.S/assembler.h.
	 */
16	.macro		xts_reload_mask, tmp
17	xts_load_mask	\tmp
18	.endm
19
20	/* multiply by polynomial 'x' in GF(2^8) */
21	.macro		mul_by_x, out, in, temp, const
	/* \temp := 0xff in every byte lane whose top bit was set (arithmetic
	 * shift replicates the sign bit across the lane) */
22	sshr		\temp, \in, #7
	/* \out := \in << 1: multiply by x, before reduction */
23	shl		\out, \in, #1
	/* keep the reduction polynomial (callers pass \const = v12 = 0x1b)
	 * only in the lanes that overflowed */
24	and		\temp, \temp, \const
	/* fold the polynomial back in to complete the modular reduction */
25	eor		\out, \out, \temp
26	.endm
27
28	/* multiply by polynomial 'x^2' in GF(2^8) */
29	.macro		mul_by_x2, out, in, temp, const
	/* \temp := the two bits that will be shifted out of each byte lane */
30	ushr		\temp, \in, #6
	/* \out := \in << 2: multiply by x^2, before reduction */
31	shl		\out, \in, #2
	/* polynomial multiply the carried-out bits by the reduction
	 * polynomial (callers pass \const = v12 = 0x1b) */
32	pmul		\temp, \temp, \const
	/* fold the reduction term back in */
33	eor		\out, \out, \temp
34	.endm
35
36	/* preload the entire Sbox */
	/*
	 * Fixed SIMD register allocation established here and relied on by
	 * every macro below:
	 *   v12      = 0x1b per byte (GF(2^8) reduction polynomial)
	 *   v13      = ShiftRows byte permutation for tbl
	 *   v14      = permutation rotating each 32-bit word right by 8 bits
	 *   v16-v31  = the full 256-byte S-box, in four 64-byte tbl/tbx groups
	 * \temp is a scratch general-purpose register.
	 */
37	.macro		prepare, sbox, shiftrows, temp
38	movi		v12.16b, #0x1b
39	ldr_l		q13, \shiftrows, \temp
40	ldr_l		q14, .Lror32by8, \temp
41	adr_l		\temp, \sbox
42	ld1		{v16.16b-v19.16b}, [\temp], #64
43	ld1		{v20.16b-v23.16b}, [\temp], #64
44	ld1		{v24.16b-v27.16b}, [\temp], #64
45	ld1		{v28.16b-v31.16b}, [\temp]
46	.endm
47
48	/* do preload for encryption */
49	.macro		enc_prepare, ignore0, ignore1, temp
50	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp
51	.endm
52
53	.macro		enc_switch_key, ignore0, ignore1, temp
54	/* do nothing */
55	.endm
56
57	/* do preload for decryption */
58	.macro		dec_prepare, ignore0, ignore1, temp
59	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp
60	.endm
61
62	/* apply SubBytes transformation using the preloaded Sbox */
	/*
	 * tbl/tbx can only index 64 bytes of table at a time, so the 256-byte
	 * S-box is split across four register groups (v16-v31).  v15 holds
	 * 0x40 here (loaded by do_block just before this macro), so each
	 * 'sub' rebases the indices into the next 64-byte quarter.  tbx, as
	 * opposed to tbl, leaves the destination lane unchanged when the
	 * index is out of range, which merges the four partial lookups.
	 * Note the rebased indices (v9-v11) are computed before tbl
	 * overwrites \in.
	 */
63	.macro		sub_bytes, in
64	sub		v9.16b, \in\().16b, v15.16b
65	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
66	sub		v10.16b, v9.16b, v15.16b
67	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
68	sub		v11.16b, v10.16b, v15.16b
69	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
70	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
71	.endm
72
73	/* apply MixColumns transformation */
	/* clobbers v8 and v9; \enc selects forward (1) or inverse (0) */
74	.macro		mix_columns, in, enc
75	.if		\enc == 0
76	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	/* v8 := x^2 * in; xoring it in directly and rotated by 16 bits
	 * (rev32 on .8h swaps the halfwords of each 32-bit word) reduces
	 * InvMixColumns to the forward MixColumns sequence below */
77	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
78	eor		\in\().16b, \in\().16b, v8.16b
79	rev32		v8.8h, v8.8h
80	eor		\in\().16b, \in\().16b, v8.16b
81	.endif
82
	/* forward MixColumns from the xtime product (v9 = x * in) plus
	 * per-word rotations of the state */
83	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
84	rev32		v8.8h, \in\().8h
85	eor		v8.16b, v8.16b, v9.16b
86	eor		\in\().16b, \in\().16b, v8.16b
	/* v14 (.Lror32by8) rotates each 32-bit word right by 8 bits */
87	tbl		\in\().16b, {\in\().16b}, v14.16b
88	eor		\in\().16b, \in\().16b, v8.16b
89	.endm
90
	/*
	 * do_block - run one full AES transform over the single state in \in.
	 * \enc selects encrypt (1) or decrypt (0); \rounds is the number of
	 * rounds and \rk points to the key schedule.  \rkp and \i are scratch
	 * general-purpose registers.  Requires the tables set up by 'prepare'
	 * (v12-v14, v16-v31) to be live; clobbers v8-v11 and v15.
	 */
91	.macro		do_block, enc, in, rounds, rk, rkp, i
92	ld1		{v15.4s}, [\rk]
93	add		\rkp, \rk, #16
94	mov		\i, \rounds
951111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	/* v15 := 0x40 per byte, consumed by sub_bytes to rebase indices */
96	movi		v15.16b, #0x40
97	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
98	sub_bytes	\in
99	subs		\i, \i, #1
100	ld1		{v15.4s}, [\rkp], #16
	/* final round: skip MixColumns and just add the last round key */
101	beq		2222f
102	mix_columns	\in, \enc
103	b		1111b
1042222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
105	.endm
106
107	.macro		encrypt_block, in, rounds, rk, rkp, i
108	do_block	1, \in, \rounds, \rk, \rkp, \i
109	.endm
110
111	.macro		decrypt_block, in, rounds, rk, rkp, i
112	do_block	0, \in, \rounds, \rk, \rkp, \i
113	.endm
114
115	/*
116	 * Interleaved versions: functionally equivalent to the
117	 * ones above, but applied to 2 or 4 AES states in parallel.
118	 */
119
	/*
	 * SubBytes on two states at once: the same four-quarter tbl/tbx
	 * S-box lookup as sub_bytes (v15 must hold 0x40), with the
	 * index-rebasing subtractions interleaved between the two states'
	 * table lookups to hide tbl/tbx latency.  Clobbers v8-v11.
	 */
120	.macro		sub_bytes_2x, in0, in1
121	sub		v8.16b, \in0\().16b, v15.16b
122	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
123	sub		v9.16b, \in1\().16b, v15.16b
124	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
125	sub		v10.16b, v8.16b, v15.16b
126	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
127	sub		v11.16b, v9.16b, v15.16b
128	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
129	sub		v8.16b, v10.16b, v15.16b
130	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
131	sub		v9.16b, v11.16b, v15.16b
132	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
133	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
134	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
135	.endm
136
	/*
	 * SubBytes on four states at once.  Same scheme as sub_bytes_2x
	 * (v15 must hold 0x40), but v8-v11 are rolled: each scratch register
	 * alternately carries a rebased index for one state's next quarter
	 * while the other states' lookups proceed, so only four scratch
	 * registers cover four states times three rebased index sets.
	 */
137	.macro		sub_bytes_4x, in0, in1, in2, in3
138	sub		v8.16b, \in0\().16b, v15.16b
139	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
140	sub		v9.16b, \in1\().16b, v15.16b
141	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
142	sub		v10.16b, \in2\().16b, v15.16b
143	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
144	sub		v11.16b, \in3\().16b, v15.16b
145	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
146	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
147	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
148	sub		v8.16b, v8.16b, v15.16b
149	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
150	sub		v9.16b, v9.16b, v15.16b
151	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
152	sub		v10.16b, v10.16b, v15.16b
153	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
154	sub		v11.16b, v11.16b, v15.16b
155	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
156	sub		v8.16b, v8.16b, v15.16b
157	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
158	sub		v9.16b, v9.16b, v15.16b
159	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
160	sub		v10.16b, v10.16b, v15.16b
161	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
162	sub		v11.16b, v11.16b, v15.16b
163	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
164	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
165	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
166	.endm
167
	/*
	 * mul_by_x applied to two vectors, with the two dependency chains
	 * interleaved for ILP.  Same per-vector algorithm as mul_by_x:
	 * shift left by 1, then xor in \const (0x1b) wherever the lane's
	 * top bit was set (sshr #7 sign-fills those lanes with 0xff).
	 */
168	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
169	sshr		\tmp0\().16b, \in0\().16b, #7
170	shl		\out0\().16b, \in0\().16b, #1
171	sshr		\tmp1\().16b, \in1\().16b, #7
172	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
173	shl		\out1\().16b, \in1\().16b, #1
174	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
175	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
176	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
177	.endm
178
	/*
	 * mul_by_x2 applied to two vectors, interleaved for ILP.  Same
	 * per-vector algorithm as mul_by_x2: shift left by 2, then reduce by
	 * polynomial-multiplying the two carried-out bits (ushr #6) by
	 * \const (0x1b) and xoring the product back in.
	 */
179	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
180	ushr		\tmp0\().16b, \in0\().16b, #6
181	shl		\out0\().16b, \in0\().16b, #2
182	ushr		\tmp1\().16b, \in1\().16b, #6
183	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
184	shl		\out1\().16b, \in1\().16b, #2
185	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
186	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
187	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
188	.endm
189
	/*
	 * MixColumns on two states in parallel; same algorithm as
	 * mix_columns (see there for the math), interleaved across the two
	 * states.  Clobbers v8-v11.
	 */
190	.macro		mix_columns_2x, in0, in1, enc
191	.if		\enc == 0
192	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
193	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
194	eor		\in0\().16b, \in0\().16b, v8.16b
	/* rev32 on .8h rotates each 32-bit word by 16 bits */
195	rev32		v8.8h, v8.8h
196	eor		\in1\().16b, \in1\().16b, v9.16b
197	rev32		v9.8h, v9.8h
198	eor		\in0\().16b, \in0\().16b, v8.16b
199	eor		\in1\().16b, \in1\().16b, v9.16b
200	.endif
201
202	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
203	rev32		v10.8h, \in0\().8h
204	rev32		v11.8h, \in1\().8h
205	eor		v10.16b, v10.16b, v8.16b
206	eor		v11.16b, v11.16b, v9.16b
207	eor		\in0\().16b, \in0\().16b, v10.16b
208	eor		\in1\().16b, \in1\().16b, v11.16b
	/* v14 (.Lror32by8) rotates each 32-bit word right by 8 bits */
209	tbl		\in0\().16b, {\in0\().16b}, v14.16b
210	tbl		\in1\().16b, {\in1\().16b}, v14.16b
211	eor		\in0\().16b, \in0\().16b, v10.16b
212	eor		\in1\().16b, \in1\().16b, v11.16b
213	.endm
214
	/*
	 * do_block_2x - run one full AES transform over the two states in
	 * \in0/\in1 in parallel.  Same structure as do_block: \enc selects
	 * encrypt (1) / decrypt (0), \rounds and \rk describe the key
	 * schedule, \rkp and \i are scratch GP registers.  Requires the
	 * 'prepare' tables (v12-v14, v16-v31); clobbers v8-v11 and v15.
	 */
215	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
216	ld1		{v15.4s}, [\rk]
217	add		\rkp, \rk, #16
218	mov		\i, \rounds
2191111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
220	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	/* v15 := 0x40 per byte for sub_bytes_2x index rebasing */
221	movi		v15.16b, #0x40
222	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
223	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
224	sub_bytes_2x	\in0, \in1
225	subs		\i, \i, #1
226	ld1		{v15.4s}, [\rkp], #16
	/* final round: skip MixColumns */
227	beq		2222f
228	mix_columns_2x	\in0, \in1, \enc
229	b		1111b
2302222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
231	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
232	.endm
233
	/*
	 * do_block_4x - run one full AES transform over the four states in
	 * \in0-\in3 in parallel.  Same structure as do_block/do_block_2x;
	 * MixColumns is done as two mix_columns_2x passes.  Requires the
	 * 'prepare' tables (v12-v14, v16-v31); clobbers v8-v11 and v15.
	 */
234	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
235	ld1		{v15.4s}, [\rk]
236	add		\rkp, \rk, #16
237	mov		\i, \rounds
2381111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
239	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
240	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
241	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	/* v15 := 0x40 per byte for sub_bytes_4x index rebasing */
242	movi		v15.16b, #0x40
243	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
244	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
245	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
246	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
247	sub_bytes_4x	\in0, \in1, \in2, \in3
248	subs		\i, \i, #1
249	ld1		{v15.4s}, [\rkp], #16
	/* final round: skip MixColumns */
250	beq		2222f
251	mix_columns_2x	\in0, \in1, \enc
252	mix_columns_2x	\in2, \in3, \enc
253	b		1111b
2542222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
255	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
256	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
257	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
258	.endm
259
260	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i
261	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
262	.endm
263
264	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i
265	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
266	.endm
267
268	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
269	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
270	.endm
271
272	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
273	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
274	.endm
275
276#include "aes-modes.S"
277
278	.section	".rodata", "a"
	/* .align 6 => 2^6 = 64-byte alignment for the table loads */
279	.align		6
	/*
	 * AES forward S-box (256 bytes), loaded into v16-v31 by enc_prepare
	 * and consumed 64 bytes per tbl/tbx group in sub_bytes*.
	 */
280.LForward_Sbox:
281	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
282	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
283	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
284	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
285	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
286	.byte		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
287	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
288	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
289	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
290	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
291	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
292	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
293	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
294	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
295	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
296	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
297	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
298	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
299	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
300	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
301	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
302	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
303	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
304	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
305	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
306	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
307	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
308	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
309	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
310	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
311	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
312	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
313
	/*
	 * AES inverse S-box (256 bytes), loaded into v16-v31 by dec_prepare;
	 * the exact inverse permutation of .LForward_Sbox.
	 */
314.LReverse_Sbox:
315	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
316	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
317	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
318	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
319	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
320	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
321	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
322	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
323	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
324	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
325	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
326	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
327	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
328	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
329	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
330	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
331	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
332	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
333	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
334	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
335	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
336	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
337	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
338	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
339	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
340	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
341	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
342	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
343	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
344	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
345	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
346	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
347
	/*
	 * tbl permutation vectors: result byte i is taken from the source
	 * lane named by byte i of the table.
	 */
	/* forward ShiftRows permutation (loaded into v13 by enc_prepare) */
348.LForward_ShiftRows:
349	.octa		0x0b06010c07020d08030e09040f0a0500
350
	/* inverse ShiftRows permutation (loaded into v13 by dec_prepare) */
351.LReverse_ShiftRows:
352	.octa		0x0306090c0f0205080b0e0104070a0d00

	/* rotate each 32-bit word right by 8 bits (v14, used by MixColumns) */
354.Lror32by8:
355	.octa		0x0c0f0e0d080b0a090407060500030201
356