1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
4 *
5 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <asm/assembler.h>
10
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	// Symbolic names for the NEON registers. Several aliases share a
	// physical register (XH/IN1, t4q/T2, MASK/SHASH2_p8, k48/SHASH2_p64,
	// HH/s1l-s1h, XL2/t0q, etc.); the code below never keeps two aliases
	// of the same register live at the same time.
	SHASH		.req	q0		@ H, the hash key
	T1		.req	q1		@ scratch / current input block
	XL		.req	q2		@ low half of the 256-bit product
	XM		.req	q3		@ middle (Karatsuba) term
	XH		.req	q4		@ high half of the 256-bit product
	IN1		.req	q4		@ byte-rotated input (aliases XH)

	// d-register views of the q registers above (dN/dN+1 = low/high
	// 64 bits of q(N/2))
	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	// temporaries used by the __pmull_p8 fallback multiply
	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9		@ aliases t4q

	// pre-rotated copies of the key halves, set up by
	// pmull_ghash_update_p8 and consumed by __pmull_p8
	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28		@ reduction polynomial (p64 path)
	SHASH2_p8	.req	d28		@ SHASH_L ^ SHASH_H (p8 path)

	// byte masks used by __pmull_p8 to trim partial products
	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31		@ SHASH_L ^ SHASH_H (p64 path)

	// higher powers of the key, used by the 4-way aggregated p64 loop
	HH		.req	q10		@ H^2
	HH3		.req	q11		@ H^3
	HH4		.req	q12		@ H^4
	HH34		.req	q13		@ Karatsuba terms for H^3/H^4

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29		@ HH_L ^ HH_H

	// per-iteration state of the 4-way loop (alias the p8 temporaries,
	// which are unused on the p64 path)
	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
94
	/*
	 * __pmull_p64 - 64x64 -> 128 bit carryless multiply using the
	 * vmull.p64 instruction. The \b1-\b4 arguments are ignored: they
	 * exist only so this macro has the same signature as __pmull_p8,
	 * allowing ghash_update to invoke either one as __pmull_\pn.
	 */
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
98
99	/*
100	 * This implementation of 64x64 -> 128 bit polynomial multiplication
101	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
102	 * "Fast Software Polynomial Multiplication on ARM Processors Using
103	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
104	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
105	 *
106	 * It has been slightly tweaked for in-order performance, and to allow
107	 * 'rq' to overlap with 'ad' or 'bd'.
108	 */
109	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
110	vext.8		t0l, \ad, \ad, #1	@ A1
111	.ifc		\b1, t4l
112	vext.8		t4l, \bd, \bd, #1	@ B1
113	.endif
114	vmull.p8	t0q, t0l, \bd		@ F = A1*B
115	vext.8		t1l, \ad, \ad, #2	@ A2
116	vmull.p8	t4q, \ad, \b1		@ E = A*B1
117	.ifc		\b2, t3l
118	vext.8		t3l, \bd, \bd, #2	@ B2
119	.endif
120	vmull.p8	t1q, t1l, \bd		@ H = A2*B
121	vext.8		t2l, \ad, \ad, #3	@ A3
122	vmull.p8	t3q, \ad, \b2		@ G = A*B2
123	veor		t0q, t0q, t4q		@ L = E + F
124	.ifc		\b3, t4l
125	vext.8		t4l, \bd, \bd, #3	@ B3
126	.endif
127	vmull.p8	t2q, t2l, \bd		@ J = A3*B
128	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
129	veor		t1q, t1q, t3q		@ M = G + H
130	.ifc		\b4, t3l
131	vext.8		t3l, \bd, \bd, #4	@ B4
132	.endif
133	vmull.p8	t4q, \ad, \b3		@ I = A*B3
134	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
135	vmull.p8	t3q, \ad, \b4		@ K = A*B4
136	vand		t0h, t0h, k48
137	vand		t1h, t1h, k32
138	veor		t2q, t2q, t4q		@ N = I + J
139	veor		t0l, t0l, t0h
140	veor		t1l, t1l, t1h
141	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
142	vand		t2h, t2h, k16
143	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
144	vmov.i64	t3h, #0
145	vext.8		t0q, t0q, t0q, #15
146	veor		t2l, t2l, t2h
147	vext.8		t1q, t1q, t1q, #14
148	vmull.p8	\rq, \ad, \bd		@ D = A*B
149	vext.8		t2q, t2q, t2q, #13
150	vext.8		t3q, t3q, t3q, #12
151	veor		t0q, t0q, t1q
152	veor		t2q, t2q, t3q
153	veor		\rq, \rq, t0q
154	veor		\rq, \rq, t2q
155	.endm
156
157	//
158	// PMULL (64x64->128) based reduction for CPUs that can do
159	// it in a single instruction.
160	//
161	.macro		__pmull_reduce_p64
162	vmull.p64	T1, XL_L, MASK
163
164	veor		XH_L, XH_L, XM_H
165	vext.8		T1, T1, T1, #8
166	veor		XL_H, XL_H, XM_L
167	veor		T1, T1, XL
168
169	vmull.p64	XL, T1_H, MASK
170	.endm
171
172	//
173	// Alternative reduction for CPUs that lack support for the
174	// 64x64->128 PMULL instruction
175	//
176	.macro		__pmull_reduce_p8
177	veor		XL_H, XL_H, XM_L
178	veor		XH_L, XH_L, XM_H
179
180	vshl.i64	T1, XL, #57
181	vshl.i64	T2, XL, #62
182	veor		T1, T1, T2
183	vshl.i64	T2, XL, #63
184	veor		T1, T1, T2
185	veor		XL_H, XL_H, T1_L
186	veor		XH_L, XH_L, T1_H
187
188	vshr.u64	T1, XL, #1
189	veor		XH, XH, XL
190	veor		XL, XL, T1
191	vshr.u64	T1, T1, #6
192	vshr.u64	XL, XL, #1
193	.endm
194
	//
	// ghash_update - main GHASH loop, shared by the p64 and p8 entry
	// points below (\pn selects which multiply/reduce macros expand).
	//
	// Register contract (matches the C prototype of pmull_ghash_update):
	//   r0   = number of 16-byte blocks
	//   r1   = u64 dg[2], the running digest (read into XL, written back)
	//   r2   = source data
	//   [sp] = optional partial 'head' block pointer (NULL if none)
	// The key material (SHASH, and for p64 also HH/HH3/HH4/HH34 and the
	// SHASH2_* / MASK constants) must already be loaded by the caller.
	// Ends with 'bx lr', so the entry points fall straight into it.
	//
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0			@ Z flag tells 4f whether to loop
	b		3f

	// p64 fast path: process 4 blocks per iteration using the
	// aggregated powers H^4..H (Karatsuba: 3 multiplies per block)
0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2		@ GHASH is big-endian
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	@ fold the digest into the first block
	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	@ block 0 (oldest) x H^4
	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	@ block 1 x H^3
	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	@ block 2 x H^2
	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	@ block 3 (newest) x H
	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f			@ last 4 blocks: shared tail

	@ more to come: prefetch next pair, reduce, and go round again
	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

	// one-block-at-a-time path (p8 always; p64 until r0 % 4 == 0)
2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1		@ Z flag consumed by 'bne 0b'

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

	// shared tail: Karatsuba fixup + modular reduction
4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b			@ flags set by subs/teq above

	vst1.64		{XL}, [r1]
	bx		lr
	.endm
300
301	/*
302	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
303	 *			   struct ghash_key const *k, const char *head)
304	 */
305ENTRY(pmull_ghash_update_p64)
306	vld1.64		{SHASH}, [r3]!
307	vld1.64		{HH}, [r3]!
308	vld1.64		{HH3-HH4}, [r3]
309
310	veor		SHASH2_p64, SHASH_L, SHASH_H
311	veor		SHASH2_H, HH_L, HH_H
312	veor		HH34_L, HH3_L, HH3_H
313	veor		HH34_H, HH4_L, HH4_H
314
315	vmov.i8		MASK, #0xe1
316	vshl.u64	MASK, MASK, #57
317
318	ghash_update	p64
319ENDPROC(pmull_ghash_update_p64)
320
/*
 * Fallback version for CPUs without the 64x64 PMULL instruction: same
 * C prototype as pmull_ghash_update_p64, but only H is read from the
 * key (one block per iteration). Precomputes the byte-rotated copies
 * of H (s1l..s4h) and the trim masks (k16/k32/k48) that __pmull_p8
 * consumes, then falls into the shared ghash_update loop.
 */
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]		@ H only
	veor		SHASH2_p8, SHASH_L, SHASH_H

	@ rotations of both key halves, hoisted out of __pmull_p8
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	@ partial-product trim masks used by __pmull_p8
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8			@ tail: loop ends in 'bx lr'
ENDPROC(pmull_ghash_update_p8)
340