xref: /linux/arch/arm64/crypto/ghash-ce-core.S (revision 2da68a77)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

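	//
	// Fallback 64x64->128 carryless multiply for CPUs that lack the
	// Crypto Extensions 64-bit PMULL: byte-rotated copies of the operand
	// (via ext, or tbl with perm1-perm3 for the upper halves) are
	// multiplied with the 8-bit polynomial PMULL against the precomputed
	// rotated copies of the hash key (sh1-sh4 / ss1-ss4), and the partial
	// products are masked, realigned and XORed together in
	// __pmull_p8_tail (see the D/E/F/G/H/I/J/K annotations below).
	//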
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

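	//
	// Per-call setup for the p64 code path: x3 points at the hash key H,
	// with the precomputed powers H^2..H^4 expected 16 bytes further on.
	// SHASH2 and HH34 receive the XOR of the high and low halves of
	// H/H^2 and H^3/H^4 respectively, which feeds the Karatsuba middle
	// product (a1 + a0)(b1 + b0). MASK is loaded with the GHASH
	// reduction constant (0xe1 << 57 in each 64-bit lane).
	//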
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
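	// On entry the 256-bit product is held as <XH:XL>, with the Karatsuba
	// middle term in XM. Two carryless multiplies by MASK (0xe1 << 57,
	// i.e. 0xc200000000000000 in each 64-bit lane, a constant derived
	// from the GHASH field polynomial x^128 + x^7 + x^2 + x + 1) fold the
	// product back to 128 bits; the caller completes the reduction with
	// the two EOR instructions that follow each use of this macro.
	//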
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
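	// The multiplication by the reduction constant is carried out with
	// shifts and XORs instead: the left shifts by 57/62/63 and the right
	// shifts by 1/2/7 (the latter built up as 1, then 6 more, plus a
	// second shift by 1) account for the x^7 + x^2 + x terms of the
	// field polynomial x^128 + x^7 + x^2 + x + 1.
	//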
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

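	// 4-way aggregation: with H^2..H^4 available, four blocks C0..C3 are
	// folded in one pass as
	//
	//   X' = (X + C0)*H^4 + C1*H^3 + C2*H^2 + C3*H
	//
	// so only a single reduction is needed per four blocks of input.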
1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

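	//
	// Load the AES round keys: K0-K5 receive the first six round keys
	// and KK/KL/KM the last three (loaded from rk + rounds * 16 - 32).
	// The middle round keys (K6-K9) are loaded on demand by enc_block
	// and pmull_gcm_enc_4x.
	//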
	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

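	//
	// Encrypt a single block. The key size is dispatched on the bits of
	// \rounds (10, 12 or 14): bit 2 is only set for 192/256-bit keys,
	// and bit 1 then distinguishes AES-256 (14 rounds) from AES-192 (12).
	//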
	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

	.align		6
	.macro		pmull_gcm_do_crypt, enc
	stp		x29, x30, [sp, #-32]!
	mov		x29, sp
	str		x19, [sp, #24]
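	// A 32-byte frame is set up; the extra parameters that do not fit in
	// x0-x7 (the lengths[] block and, for decryption, the supplied tag
	// pointer and the authsize) are picked up from the caller's stack at
	// [sp, #32] and above once the bulk processing is done.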

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldp		x19, x10, [sp, #24]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	ldp		x29, x30, [sp], #32
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
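	// Encrypt four consecutive CTR blocks and XOR the keystream into
	// INP0-INP3. The upper 96 bits of the counter come from [x5]; the
	// caller has already advanced the 32-bit lower counter in w8 past
	// this batch, so the four values used here are w8 - 4 .. w8 - 1,
	// byte-swapped into the last word of each block.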
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	.section	".rodata", "a"
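	//
	// TBL returns zero (and TBX leaves the destination untouched) for
	// out-of-range indices, so 16-byte slices taken at varying offsets
	// into this table serve as combined byte-shift/byte-select masks:
	// they are used above to shift and zero-pad the final partial block
	// and to mask the tag comparison down to authsize bytes.
	//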
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous
