#include "arm_arch.h"

.text
.syntax	unified
.code	32

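@ rem_4bit[i] holds the reduction constant that is folded back into the
@ top of Z (via "eor ...,lsl#16" below) whenever a nibble is shifted out
@ in the 4-bit table-driven GHASH.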
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

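@ rem_4bit_get loads &rem_4bit into r2 PC-relatively (ARM-state pc reads as
@ the current instruction + 8) and branches back into gcm_gmult_4bit at
@ .Lrem_4bit_got.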
.type	rem_4bit_get,%function
rem_4bit_get:
	sub	r2,pc,#8
	sub	r2,r2,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

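@ gcm_ghash_4bit(Xi, Htable, inp, len): GHASH over len bytes of inp.
@ AAPCS arguments: r0 = Xi (16-byte hash value), r1 = Htable (4-bit multiple
@ table of H), r2 = inp, r3 = len in bytes (assumed to be a multiple of 16).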
.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save r3/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
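@ Each .Louter iteration consumes one 16-byte block: the input is XORed into
@ Xi byte by byte and the result is multiplied by H nibble-wise via Htable,
@ using the rem_4bit copy on the stack for the reduction.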
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrbpl	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	ldrbpl	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif
	ldrbne	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

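@ gcm_gmult_4bit(Xi, Htable): multiply Xi by H in GF(2^128).
@ AAPCS arguments: r0 = Xi (16-byte hash value), r1 = Htable; r2 is loaded
@ with &rem_4bit via rem_4bit_get above.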
.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrbpl	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
.fpu	neon

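@ gcm_gmult_neon(Xi, Htable): NEON variant of gcm_gmult_4bit.
@ r0 = Xi, r1 = Htable; H itself is loaded from 16 bytes below Htable
@ (between Xi and Htable in GCM128_CTX), as the "sub r1,#16" below shows.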
.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	sub		r1,#16		@ point at H in GCM128_CTX
	vld1.64		d29,[r0,:64]!	@ load Xi
	vmov.i32	d5,#0xe1		@ our irreducible polynomial
	vld1.64		d28,[r0,:64]!
	vshr.u64	d5,#32
	vldmia		r1,{d0-d1}	@ load H
	veor		q12,q12
#ifdef __ARMEL__
	vrev64.8	q14,q14
#endif
	veor		q13,q13
	veor		q11,q11
	mov		r1,#16
	veor		q10,q10
	mov		r3,#16
	veor		d2,d2
	vdup.8		d4,d28[0]	@ broadcast lowest byte
	b		.Linner_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

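@ gcm_ghash_neon(Xi, Htable, inp, len): NEON variant of gcm_ghash_4bit.
@ r0 = Xi (H follows it in GCM128_CTX), r2 = inp, r3 = len in bytes
@ (assumed to be a multiple of 16); r1 is reused as the inner byte counter.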
.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		d21,[r0,:64]!	@ load Xi
	vmov.i32	d5,#0xe1		@ our irreducible polynomial
	vld1.64		d20,[r0,:64]!
	vshr.u64	d5,#32
	vldmia		r0,{d0-d1}		@ load H
	veor		q12,q12
	nop
#ifdef __ARMEL__
	vrev64.8	q10,q10
#endif
.Louter_neon:
	vld1.64		d29,[r2]!	@ load inp
	veor		q13,q13
	vld1.64		d28,[r2]!
	veor		q11,q11
	mov		r1,#16
#ifdef __ARMEL__
	vrev64.8	q14,q14
#endif
	veor		d2,d2
	veor		q14,q10			@ inp^=Xi
	veor		q10,q10
	vdup.8		d4,d28[0]	@ broadcast lowest byte
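@ .Linner_neon handles one byte per iteration: Z accumulates H*byte via the
@ 8-bit polynomial multiplier (vmull.p8), then IN and Z are shifted right by
@ one byte and the carry byte is folded back in with the 0xe1 constant.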
.Linner_neon:
	subs		r1,r1,#1
	vmull.p8	q9,d1,d4		@ H.lo·Xi[i]
	vmull.p8	q8,d0,d4		@ H.hi·Xi[i]
	vext.8		q14,q12,#1		@ IN>>=8

	veor		q10,q13		@ modulo-scheduled part
	vshl.i64	d22,#48
	vdup.8		d4,d28[0]	@ broadcast lowest byte
	veor		d3,d18,d20

	veor		d21,d22
	vuzp.8		q9,q8
	vsli.8		d2,d3,#1		@ compose the "carry" byte
	vext.8		q10,q12,#1		@ Z>>=8

	vmull.p8	q11,d2,d5		@ "carry"·0xe1
	vshr.u8		d2,d3,#7		@ save Z's bottom bit
	vext.8		q13,q9,q12,#1	@ Qlo>>=8
	veor		q10,q8
	bne		.Linner_neon

	veor		q10,q13		@ modulo-scheduled artefact
	vshl.i64	d22,#48
	veor		d21,d22

	@ finalization, normalize Z:Zo
	vand		d2,d5		@ suffices to mask the bit
	vshr.u64	d3,d20,#63
	vshl.i64	q10,#1
	subs		r3,#16
	vorr		q10,q1		@ Z=Z:Zo<<1
	bne		.Louter_neon

#ifdef __ARMEL__
	vrev64.8	q10,q10
#endif
	sub		r0,#16
	vst1.64		d21,[r0,:64]!	@ write out Xi
	vst1.64		d20,[r0,:64]

	.word	0xe12fff1e		@ "bx lr" (return)
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align  2
#if defined(HAVE_GNU_STACK)
.section .note.GNU-stack,"",%progbits
#endif
