/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

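	//
	// PMULL and PMULL2 perform a carryless (polynomial) multiplication
	// over GF(2): partial products are combined with XOR rather than
	// ADD, so no carries propagate between bit positions. As a small
	// illustrative example (not taken from the code below):
	//
	//	(x + 1)(x + 1) = x^2 + x + x + 1 = x^2 + 1
	//
	// i.e., a carryless multiply of 0b11 by 0b11 yields 0b101, whereas
	// an integer multiply of 3 by 3 would yield 0b1001.
	//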
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

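	//
	// Fallback for CPUs that only implement the 8-bit polynomial
	// variant of PMULL: a 64x64 -> 128 bit multiply is synthesised from
	// 8x8 -> 16 bit ones, by multiplying byte-rotated copies of one
	// operand (A1..A3, built with ext/tbl below) against the other and
	// letting __pmull_p8_tail realign and fold the partial products.
	// The single-letter comments (A1, B1, E = A*B1, ...) name the
	// partial products of this well-known NEON decomposition.
	//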
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

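	// Separate the low and high 64-bit halves of each combined partial
	// product so that the bytes which would wrap around when the
	// products are shifted into place can be masked off (k32_48/k00_16
	// below) before zip1/zip2 recombine the halves.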
	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

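	//
	// The p64 preamble expects x3 to point at the hash key H followed
	// by its precomputed powers (HH = H^2, HH3 = H^3, HH4 = H^4), which
	// the 4-way aggregated code paths rely on. Each trn1/trn2/eor
	// triple below folds a 128-bit key k = k1:k0 into k1 + k0, the
	// per-key constant needed for the middle term of the Karatsuba
	// style multiplication used throughout:
	//	(a1 + a0)(b1 + b0) = a1*b1 + a0*b0 + (a1*b0 + a0*b1)
	//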
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
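	// GHASH works modulo the bit-reflected polynomial
	//	P(x) = x^128 + x^7 + x^2 + x + 1
	// MASK (0xe1 << 57 per 64-bit lane) has bits 57, 62 and 63 set,
	// encoding the low-order terms x^7 + x^2 + x of P in this reflected
	// representation, so the two pmull-by-MASK steps below fold the
	// upper half of the 256-bit product back into 128 bits.
	//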
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
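	// The shl-by-57/62/63 and ushr-by-1/2/7 cascades below compute,
	// lane by lane, the same carryless products by the reduction
	// constant that the p64 variant gets from a single pmull by MASK;
	// the bits that cross the 64-bit lane boundary are recovered
	// explicitly with ext/mov.
	//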
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4
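	// 4-way aggregated loop: using the precomputed powers of H, four
	// blocks are folded per iteration as
	//	X' = (X + b0)*H^4 + b1*H^3 + b2*H^2 + b3*H
	// which matches four serial Horner steps X = (X + b)*H, but exposes
	// four independent multiply chains and needs only one reduction.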

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
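	// On entry (AAPCS64): w0 = blocks, x1 = dg (the running GHASH
	// state), x2 = src, x3 = key, x4 = head (optional initial block,
	// may be NULL).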
SYM_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

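	// Load the AES round keys: K0-K5 hold the first six, K6-K9 are
	// (re)loaded on demand by enc_block and pmull_gcm_enc_4x, and
	// KK/KL/KM always hold the final three round keys, found at offset
	// 16 * rounds - 32 from the start of the key schedule.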
	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	stp		x29, x30, [sp, #-32]!
	mov		x29, sp
	str		x19, [sp, #24]

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldp		x19, x10, [sp, #24]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	ldp		x29, x30, [sp], #32
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag[])
	 */
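	// On entry (AAPCS64): w0 = byte count (the code above handles
	// partial blocks, so this is not a block count), x1 = dst, x2 =
	// src, x3 = key (H and its powers), x4 = dg, x5 = ctr, x6 = round
	// keys, x7 = rounds. The tag buffer (holding lengths[] on entry)
	// is passed as a ninth argument on the stack and picked up at
	// label 3 in the macro above.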
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag[])
	 */
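	// Decryption takes the same arguments; the expected tag pointer
	// and the authsize are passed as additional stack arguments (see
	// the ldp from [sp, #40] in the macro above), so the computed tag
	// can be compared without branching on secret data, returning a
	// pass (0) / fail (-1) verdict in w0.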
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
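	// Fold the w9 (1-4) blocks held in INP0-INP3 into the GHASH state
	// in XL. A full set of four uses the aggregated form
	//	X' = (X + in0)*H^4 + in1*H^3 + in2*H^2 + in3*H
	// while the out-of-line stubs below handle a 1-3 block tail by
	// zeroing the unused accumulators and entering the multiply chain
	// at .Lgh1/.Lgh2/.Lgh3.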
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
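	// Generate four CTR keystream blocks and XOR them into INP0-INP3.
	// On entry, w8 already holds the counter value for after this call
	// (the caller pre-added w9), so the four counter values are derived
	// as w8 - 4 .. w8 - 1 and byte-reversed into the big-endian counter
	// word of each block before encryption.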
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
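	// Loaded at a byte offset n, a 16-byte window of this table forms
	// a tbl/tbx index vector: in-range indices select input bytes,
	// while the 0xff entries select zero for tbl (or leave the
	// destination intact for tbx). This provides the masks used above
	// to realign partial blocks and to retain only authsize bytes of
	// the tag comparison result.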
	.previous