/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

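// Context layout assumed throughout this file (offsets from x0):
//   #0..#23   hash accumulator h (h0,h1,h2 in base 2^64, or five
//             32-bit base 2^26 limbs when is_base2_26 is set)
//   #24       is_base2_26 flag
//   #32..#47  clamped key r (r0,r1)
//   #48..     base 2^26 table of r^1..r^4 used by the NEON path
// poly1305_init also writes the selected blocks/emit entry points to [x2].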
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

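// poly1305_blocks(ctx=x0, inp=x1, len=x2, padbit=x3)
//
// Scalar path; len is rounded down to a multiple of 16. For each
// 16-byte block, h += block + padbit*2^128, then h = h*r mod 2^130-5.
// h is kept in three 64-bit words; s1 = r1 + (r1>>2) = 5*(r1/4) is
// valid because the clamped r1 is a multiple of 4, and it lets the
// h1*r1 and h2*r1 product terms (which sit at 2^128 and above) be
// folded straight back into the low words, since 2^130 == 5
// (mod 2^130-5).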
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

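// poly1305_emit(ctx=x0, mac=x1, nonce=x2)
//
// Final reduction: h is compared against p = 2^130-5 by computing
// h+5; if that sum reaches 2^130 (any bit >= 130 set, i.e. x14&-4
// is non-zero) then h >= p and h+5 is selected instead of h. Only
// the low 128 bits matter for the tag and 2^130 == 0 (mod 2^128),
// so no explicit subtraction of 2^130 is needed. The nonce is then
// added modulo 2^128 and the result stored little-endian.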
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
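// poly1305_mult: h = h*r mod 2^130-5, with h in x4-x6, r in x7-x8
// and s1 = r1 + (r1>>2) in x9 (same schoolbook multiply as the
// scalar loop above). The tail folds the bits of the top word above
// 2^130 back in multiplied by 5: for c = x14>>2, 5*c is computed as
// (x14 & ~3) + (x14>>2).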
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

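// poly1305_splat: converts the power of r held in x4-x6 from base
// 2^64 into five 26-bit limbs and stores them, together with the
// precomputed 5*r1..5*r4, as 32-bit lanes of the NEON key table at
// x0 (one lane per 16-byte group, so the four powers r^1..r^4 end
// up interleaved).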
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

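// poly1305_blocks_neon: vector path. The hash is kept as five 26-bit
// limbs spread across NEON lanes, and four 16-byte blocks are
// absorbed per iteration against the r^4..r^1 table built by
// poly1305_splat; the 26-bit radix keeps every umlal accumulator
// inside 64 bits so carries can be propagated lazily (see the
// in-loop comments below). Inputs shorter than 128 bytes that are
// still in base 2^64 are simply passed to the scalar
// poly1305_blocks, and a stray 16-byte block is absorbed with
// poly1305_mult before the vector loop is entered.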
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

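// poly1305_emit_neon: same as poly1305_emit, except that when the
// hash was left in base 2^26 by the vector path it is first
// converted back to base 2^64 and its pending carry folded in
// before the final comparison with 2^130-5.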
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2