/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# ifdef	__APPLE__
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__APPLE__
	adr	r9,poly1305_blocks_neon
	adr	r11,poly1305_blocks
#  ifdef __thumb2__
	it	ne
#  endif
	movne	r11,r9
	adr	r12,poly1305_emit
	adr	r10,poly1305_emit_neon
#  ifdef __thumb2__
	it	ne
#  endif
	movne	r12,r10
# else
#  ifdef __thumb2__
	itete	eq
#  endif
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef	__thumb2__
	orr	r12,r12,#1	@ thumb-ify address
	orr	r11,r11,#1
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
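	@ The masks above clamp r as the Poly1305 spec requires: the top four
	@ bits of every 32-bit word and the low two bits of the upper three
	@ words are cleared, i.e. viewed as a 128-bit little-endian integer
	@ r &= 0x0ffffffc0ffffffc0ffffffc0fffffff before being stored.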
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}		@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2
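	@ r10-r12 now hold r1+(r1>>2), r2+(r2>>2) and r3+(r3>>2).  Since the
	@ clamped r1..r3 are multiples of 4, each equals 5*rN/4, the factor
	@ picked up by product terms of weight 2^128 and above when they are
	@ folded back down, because 2^130 == 5 mod p = 2^130-5.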
	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

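	@ Fold h back below 2^130: the bits of h4 above its low two represent
	@ multiples of 2^130, and 2^130 == 5 mod p, so (h4&~3)+((h4&~3)>>2)
	@ = 5*(h4>>2) is added into h0 while only h4&3 (plus a possible
	@ carry) remains on top.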
	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
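	@ g = h+5 now sits in r8-r11 (low 128 bits) and r7 (bits 128 and up).
	@ If bit 130 of g is set (the #4 bit of r7), then h >= p = 2^130-5 and
	@ h-p (i.e. g with bit 130 dropped) is the reduced value, so the
	@ conditional moves below select g; otherwise h is kept.  Only the low
	@ 128 bits matter, as the tag is (h + nonce) mod 2^128.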

#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
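	@
	@ Here h0..h4 and r0..r4 are base 2^26 limbs, i.e. x = x0 + x1*2^26 +
	@ x2*2^52 + x3*2^78 + x4*2^104.  The 5* factors appear because any
	@ product term whose weight reaches 2^130 wraps around multiplied by
	@ 5, since 2^130 == 5 mod 2^130-5.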

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
	@ an m-bit number multiplied by 2^n is still only n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide; the sum of three
	@ is n+2, and so is the sum of four. The sum of 2^m (n-m)-bit
	@ numbers and one n-bit number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases where their width exceeds 26 bits
	@ they are bounded by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0*R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2^33 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited to
	@ 5*4+1 52-bit addends, and 5*H4 to 5*5, or 57 bits. But when
	@ hashing input, H0 is limited to (5*4+1)*3 addends, or 58 bits,
	@ while 5*H4 to 5*5*3, or 59[!] bits. Why is this relevant? The
	@ vmlal.u32 instruction accepts 2x32-bit inputs and accumulates
	@ into a 2x64-bit result. This means that the result of the
	@ reduction has to be compressed upon loop wrap-around. This can
	@ be done in the process of reduction to minimize the number of
	@ instructions [as well as the number of 128-bit instructions,
	@ which benefits low-end processors], but one has to make sure
	@ that H2 (which is narrower than H0) and 5*H4 are not wider than
	@ 58 bits, so that the result of the right shift by 26 bits fits
	@ in 32 bits. This is also useful on x86, because it allows paddd
	@ to be used in place of paddq, which benefits Atom, where paddq
	@ is ridiculously slow.
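	@
	@ In effect the sequence below performs the two carry chains shown
	@ above, interleaved: h3->h4 and h0->h1 first, then h4->h0 (the
	@ carry folded back times 5 via the extra shift-by-2 add) together
	@ with h1->h2, then h2->h3, and finally h0->h1 and h3->h4 again.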

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30	@ h4 -> h0
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10		@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10		@ r^4:r^3
	vshl.u32	d2,d12,#2		@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction in the previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
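	@
	@ The loop keeps two interleaved hash streams, one per 64-bit lane of
	@ the accumulators q5-q9.  Lane [1] of d0-d8 holds the limbs of r^2,
	@ applied to the freshly loaded inp[2:3]; lane [0] holds r^4, applied
	@ to hash+inp[0:1].  The streams are merged by the horizontal
	@ additions in .Lshort_tail.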

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26		@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}			@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr					@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2	@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
