// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
@ instructions are in ghashv8-armx.pl.)


.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#ifdef  __clang__
#define ldrplb  ldrbpl
#define ldrneb  ldrbne
#endif


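@ rem_4bit is the standard 4-bit (Shoup) GHASH reduction table used by the
@ gmult/ghash routines below: rem_4bit[rem], shifted left by 16 bits, is XORed
@ into the top word of Xi to fold the four bits shifted out of the low end
@ back in modulo the GHASH polynomial.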
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0


#ifdef __thumb2__
.thumb_func	rem_4bit_get
#endif
rem_4bit_get:
#if defined(__thumb2__)
	adr	r2,rem_4bit
#else
	sub	r2,pc,#8+32	@ &rem_4bit
#endif
	b	Lrem_4bit_got
	nop
	nop


.globl	_GFp_gcm_ghash_4bit
.private_extern	_GFp_gcm_ghash_4bit
#ifdef __thumb2__
.thumb_func	_GFp_gcm_ghash_4bit
#endif
.align	4
_GFp_gcm_ghash_4bit:
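@ Assumed C prototype (following the upstream gcm_ghash_4bit convention; not
@ stated in this file):
@   void GFp_gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16],
@                           const uint8_t *inp, size_t len);
@ with r0 = Xi, r1 = Htable, r2 = inp, r3 = len (a multiple of 16).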
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}		@ save r3/end too

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#ifdef __thumb2__
	it	ne
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif


.globl	_GFp_gcm_gmult_4bit
.private_extern	_GFp_gcm_gmult_4bit
#ifdef __thumb2__
.thumb_func	_GFp_gcm_gmult_4bit
#endif
_GFp_gcm_gmult_4bit:
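@ Assumed C prototype (following the upstream gcm_gmult_4bit convention):
@   void GFp_gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]);
@ with r0 = Xi and r1 = Htable; computes Xi = Xi * H in GF(2^128).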
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif

#if __ARM_MAX_ARCH__>=7



.globl	_GFp_gcm_init_neon
.private_extern	_GFp_gcm_init_neon
#ifdef __thumb2__
.thumb_func	_GFp_gcm_init_neon
#endif
.align	4
_GFp_gcm_init_neon:
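@ Assumed to correspond to the upstream gcm_init_neon(Htable, H) convention:
@ r0 = output (16 bytes), r1 = H (16 bytes). Shifts H left by one bit and, if
@ the bit shifted out was set, XORs in the 0xc2...01 constant; the resulting
@ "twisted H" is stored at r0 for the NEON gmult/ghash routines below.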
	vld1.64	d7,[r1]!		@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26		@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	bx	lr					@ bx lr


.globl	_GFp_gcm_gmult_neon
.private_extern	_GFp_gcm_gmult_neon
#ifdef __thumb2__
.thumb_func	_GFp_gcm_gmult_neon
#endif
.align	4
_GFp_gcm_gmult_neon:
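@ Assumed C prototype (following the upstream gcm_gmult_neon convention):
@   void GFp_gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[1]);
@ r0 = Xi, r1 = twisted H as written by GFp_gcm_init_neon; computes Xi = Xi * H.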
	vld1.64	d7,[r0]!		@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	Lgmult_neon


.globl	_GFp_gcm_ghash_neon
.private_extern	_GFp_gcm_ghash_neon
#ifdef __thumb2__
.thumb_func	_GFp_gcm_ghash_neon
#endif
.align	4
_GFp_gcm_ghash_neon:
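@ Assumed C prototype (following the upstream gcm_ghash_neon convention):
@   void GFp_gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[1],
@                           const uint8_t *inp, size_t len);
@ r0 = Xi, r1 = twisted H, r2 = inp, r3 = len (a multiple of 16); XORs each
@ 16-byte block into Xi and multiplies the result by H.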
	vld1.64	d1,[r0]!		@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

Loop_neon:
	vld1.64	d7,[r2]!		@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
Lgmult_neon:
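	@ 128-bit carry-less multiply without PMULL: each 64x64 half is built
	@ from eight 8-bit vmull.p8 partial products (the A1..A3/B1..B4 steps
	@ below), and the three halves (lo*lo, (lo^hi)*(lo^hi), hi*hi) are
	@ combined Karatsuba-style before the final reduction modulo the GHASH
	@ polynomial.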
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22		@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d0, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22		@ K = A*B4
	veor	q10, q10, q0		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7	@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2		@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22		@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d2, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22		@ K = A*B4
	veor	q10, q10, q1		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7		@ F = A1*B
	vext.8	d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4		@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7		@ H = A2*B
	vext.8	d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22		@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7		@ J = A3*B
	vext.8	d4, d7, d7, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22		@ K = A*B4
	veor	q10, q10, q2		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20	@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0]!		@ write out Xi
	vst1.64	d0,[r0]

	bx	lr					@ bx lr

#endif
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM