1bc3d5698SJohn Baldwin/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
2bc3d5698SJohn Baldwin#include "arm_arch.h"
3bc3d5698SJohn Baldwin
4bc3d5698SJohn Baldwin.text
5bc3d5698SJohn Baldwin
6bc3d5698SJohn Baldwin// forward "declarations" are required for Apple
7bc3d5698SJohn Baldwin
8c3c73b4fSJung-uk Kim.hidden	OPENSSL_armcap_P
9bc3d5698SJohn Baldwin.globl	poly1305_init
10c3c73b4fSJung-uk Kim.hidden	poly1305_init
11c3c73b4fSJung-uk Kim.globl	poly1305_blocks
12c3c73b4fSJung-uk Kim.hidden	poly1305_blocks
13c3c73b4fSJung-uk Kim.globl	poly1305_emit
14c3c73b4fSJung-uk Kim.hidden	poly1305_emit
15c3c73b4fSJung-uk Kim
.type	poly1305_init,%function
.align	5
// poly1305_init
//   x0 - state block: bytes 0..31 are zeroed, the masked key is written
//        to bytes 32..47
//   x1 - pointer to 16 bytes of key material, or NULL
//   x2 - receives two code pointers: the block routine, then the emit
//        routine (32-bit slots under __ILP32__, 64-bit slots otherwise)
// Returns x0 = 1 when a key was installed, x0 = 0 when x1 was NULL.
poly1305_init:
	AARCH64_VALID_CALL_TARGET
	stp	xzr,xzr,[x0]		// wipe the hash value ...
	stp	xzr,xzr,[x0,#16]	// ... and the is_base2_26 word after it
	cmp	x1,#0			// NULL key?
	csel	x0,xzr,x0,eq		// then the return value is 0
	b.eq	.Lno_key

	// Build the mask 0x0ffffffc0fffffff in two steps.
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48

	ldp	x7,x8,[x1]		// two 64-bit key words
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]	// capability bits, tested below
#if defined(__AARCH64EB__)
	rev	x7,x7			// key words are byte-swapped on big-endian builds
	rev	x8,x8
#endif
	and	x7,x7,x9		// low word  &= 0x0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// high word &= 0x0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// keep the masked key in the state

	// Candidate entry points: scalar pair in x12/x13, NEON pair in x7/x8.
	adr	x12,.Lpoly1305_blocks
	adr	x13,.Lpoly1305_emit
	adr	x7,.Lpoly1305_blocks_neon
	adr	x8,.Lpoly1305_emit_neon

	tst	w17,#ARMV7_NEON		// flag bit clear -> stay with the scalar pair
	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifndef	__ILP32__
	stp	x12,x13,[x2]
#else
	stp	w12,w13,[x2]
#endif

	mov	x0,#1			// report success
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init
62bc3d5698SJohn Baldwin
.type	poly1305_blocks,%function
.align	5
// poly1305_blocks (scalar path)
//   x0 - state: hash limbs h0,h1,h2 at offsets 0,8,16; key words r0,r1
//        at offsets 32,40
//   x1 - input pointer, advanced by 16 for every block consumed
//   x2 - byte count; only whole 16-byte blocks are processed
//   x3 - value added into h2 for every block (presumably the pad bit
//        chosen by the caller - confirm against the C side)
// Scratch: x4-x14 and the flags.
poly1305_blocks:
.Lpoly1305_blocks:
	// .Lpoly1305_blocks is a local (non-.globl) label, yet poly1305_init
	// hands out a pointer to it, hence the AARCH64_VALID_CALL_TARGET marker.
	AARCH64_VALID_CALL_TARGET
	ands	x2,x2,#-16		// drop any trailing partial block
	b.eq	.Lno_data		// nothing left to do

	ldp	x7,x8,[x0,#32]		// r0, r1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	ldp	x4,x5,[x0]		// h0, h1
	ldr	x6,[x0,#16]		// h2
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// fetch one block, post-increment the pointer
#if defined(__AARCH64EB__)
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// h += block, carry rippling into h2
	adcs	x5,x5,x11
	adc	x6,x6,x3

	mul	x12,x4,x7		// h0*r0 -> x13:x12
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*s1 -> x11:x10
	umulh	x11,x5,x9
	adds	x12,x12,x10
	adc	x13,x13,x11

	mul	x10,x4,x8		// h0*r1 -> x14:x10
	umulh	x14,x4,x8
	adds	x13,x13,x10
	adc	x14,x14,xzr

	mul	x10,x5,x7		// h1*r0 -> x11:x10
	umulh	x11,x5,x7
	adds	x13,x13,x10
	adc	x14,x14,x11

	mul	x10,x6,x9		// h2*s1 (low half only)
	mul	x11,x6,x7		// h2*r0 (low half only)
	adds	x13,x13,x10
	adc	x14,x14,x11

	// Fold everything above bit 1 of the top limb back into the bottom:
	// (x14 & ~3) + (x14 >> 2) equals 5 * (x14 >> 2).
	and	x6,x14,#3
	and	x10,x14,#-4
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	sub	x2,x2,#16		// one block consumed
	cbnz	x2,.Loop

	str	x6,[x0,#16]		// write the updated hash back
	stp	x4,x5,[x0]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks
130bc3d5698SJohn Baldwin
.type	poly1305_emit,%function
.align	5
// poly1305_emit (scalar path)
//   x0 - state holding the hash as three 64-bit limbs at offsets 0, 8, 16
//   x1 - destination; 16 bytes are written
//   x2 - pointer to 16 bytes of nonce that are added to the hash
// Scratch: x4-x6, x10-x14 and the flags.
poly1305_emit:
.Lpoly1305_emit:
	// .Lpoly1305_emit is a local (non-.globl) label, yet poly1305_init
	// hands out a pointer to it, hence the AARCH64_VALID_CALL_TARGET marker.
	AARCH64_VALID_CALL_TARGET
	ldr	x6,[x0,#16]		// h2
	ldp	x4,x5,[x0]		// h0, h1

	adds	x12,x4,#5		// g = h + 5 across all three limbs
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	ldp	x10,x11,[x2]		// nonce words
#if defined(__AARCH64EB__)
	ror	x10,x10,#32		// swap the 32-bit halves of each nonce word
	ror	x11,x11,#32
#endif

	tst	x14,#-4			// did g reach bit 2 of the top limb?
	csel	x4,x12,x4,ne		// yes: use the low 128 bits of g
	csel	x5,x13,x5,ne		// no:  keep h unchanged

	adds	x4,x4,x10		// add the nonce; carry out of bit 127 is dropped
	adc	x5,x5,x11
#if defined(__AARCH64EB__)
	rev	x4,x4			// byte-swap the result on big-endian builds
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// emit the 16-byte result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
// poly1305_mult - file-local helper with a private register convention
//   in:      x4,x5,x6 = h0,h1,h2
//            x7,x8    = r0,r1
//            x9       = s1 (callers in this file compute r1 + (r1 >> 2))
//   out:     x4,x5,x6 = product after one folding step
//   scratch: x10-x14 and the flags
poly1305_mult:
	mul	x12,x4,x7		// h0*r0 -> x13:x12
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*s1 -> x11:x10
	umulh	x11,x5,x9
	adds	x12,x12,x10
	adc	x13,x13,x11

	mul	x10,x4,x8		// h0*r1 -> x14:x10
	umulh	x14,x4,x8
	adds	x13,x13,x10
	adc	x14,x14,xzr

	mul	x10,x5,x7		// h1*r0 -> x11:x10
	umulh	x11,x5,x7
	adds	x13,x13,x10
	adc	x14,x14,x11

	mul	x10,x6,x9		// h2*s1 (low half only)
	mul	x11,x6,x7		// h2*r0 (low half only)
	adds	x13,x13,x10
	adc	x14,x14,x11

	// Fold everything above bit 1 of the top limb back into the bottom:
	// (x14 & ~3) + (x14 >> 2) equals 5 * (x14 >> 2).
	and	x6,x14,#3
	and	x10,x14,#-4
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult
201bc3d5698SJohn Baldwin
.type	poly1305_splat,%function
.align	5
// poly1305_splat - file-local helper with a private register convention
//   in:      x4,x5,x6 = value held as three 64-bit limbs
//            x0       = destination base
//   effect:  splits the value into five 26-bit pieces r0..r4 and writes
//            nine 32-bit words at a 16-byte stride from x0, in the order
//            r0, r1, s1, r2, s2, r3, s3, r4, s4 where sN = 5*rN
//   scratch: x12-x16
poly1305_splat:
	and	x12,x4,#0x03ffffff	// r0 = bits 0..25
	str	w12,[x0,#16*0]

	ubfx	x13,x4,#26,#26		// r1 = bits 26..51
	str	w13,[x0,#16*1]
	add	w12,w13,w13,lsl#2	// s1 = r1*5
	str	w12,[x0,#16*2]

	extr	x14,x5,x4,#52		// r2 = bits 52..77, straddling x4 and x5
	and	x14,x14,#0x03ffffff
	str	w14,[x0,#16*3]
	add	w13,w14,w14,lsl#2	// s2 = r2*5
	str	w13,[x0,#16*4]

	ubfx	x15,x5,#14,#26		// r3 = bits 78..103
	str	w15,[x0,#16*5]
	add	w14,w15,w15,lsl#2	// s3 = r3*5
	str	w14,[x0,#16*6]

	extr	x16,x6,x5,#40		// r4 = bits 104 and up (not masked)
	str	w16,[x0,#16*7]
	add	w15,w16,w16,lsl#2	// s4 = r4*5
	str	w15,[x0,#16*8]

	ret
.size	poly1305_splat,.-poly1305_splat
228bc3d5698SJohn Baldwin
229bc3d5698SJohn Baldwin.type	poly1305_blocks_neon,%function
230bc3d5698SJohn Baldwin.align	5
231bc3d5698SJohn Baldwinpoly1305_blocks_neon:
232c0855eaaSJohn Baldwin.Lpoly1305_blocks_neon:
233bd9588bcSAndrew Turner	// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
234bd9588bcSAndrew Turner	// but a pointer to it is returned by poly1305_init
235bd9588bcSAndrew Turner	AARCH64_VALID_CALL_TARGET
236bc3d5698SJohn Baldwin	ldr	x17,[x0,#24]
237bc3d5698SJohn Baldwin	cmp	x2,#128
238bc3d5698SJohn Baldwin	b.hs	.Lblocks_neon
239c0855eaaSJohn Baldwin	cbz	x17,.Lpoly1305_blocks
240bc3d5698SJohn Baldwin
241bc3d5698SJohn Baldwin.Lblocks_neon:
242bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
243bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-80]!
244bc3d5698SJohn Baldwin	add	x29,sp,#0
245bc3d5698SJohn Baldwin
246bc3d5698SJohn Baldwin	ands	x2,x2,#-16
247bc3d5698SJohn Baldwin	b.eq	.Lno_data_neon
248bc3d5698SJohn Baldwin
249bc3d5698SJohn Baldwin	cbz	x17,.Lbase2_64_neon
250bc3d5698SJohn Baldwin
251bc3d5698SJohn Baldwin	ldp	w10,w11,[x0]		// load hash value base 2^26
252bc3d5698SJohn Baldwin	ldp	w12,w13,[x0,#8]
253bc3d5698SJohn Baldwin	ldr	w14,[x0,#16]
254bc3d5698SJohn Baldwin
255bc3d5698SJohn Baldwin	tst	x2,#31
256bc3d5698SJohn Baldwin	b.eq	.Leven_neon
257bc3d5698SJohn Baldwin
258bc3d5698SJohn Baldwin	ldp	x7,x8,[x0,#32]	// load key value
259bc3d5698SJohn Baldwin
260bc3d5698SJohn Baldwin	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
261bc3d5698SJohn Baldwin	lsr	x5,x12,#12
262bc3d5698SJohn Baldwin	adds	x4,x4,x12,lsl#52
263bc3d5698SJohn Baldwin	add	x5,x5,x13,lsl#14
264bc3d5698SJohn Baldwin	adc	x5,x5,xzr
265bc3d5698SJohn Baldwin	lsr	x6,x14,#24
266bc3d5698SJohn Baldwin	adds	x5,x5,x14,lsl#40
267bc3d5698SJohn Baldwin	adc	x14,x6,xzr		// can be partially reduced...
268bc3d5698SJohn Baldwin
269bc3d5698SJohn Baldwin	ldp	x12,x13,[x1],#16	// load input
270bc3d5698SJohn Baldwin	sub	x2,x2,#16
271bc3d5698SJohn Baldwin	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
272bc3d5698SJohn Baldwin
273bc3d5698SJohn Baldwin	and	x10,x14,#-4		// ... so reduce
274bc3d5698SJohn Baldwin	and	x6,x14,#3
275bc3d5698SJohn Baldwin	add	x10,x10,x14,lsr#2
276bc3d5698SJohn Baldwin	adds	x4,x4,x10
277bc3d5698SJohn Baldwin	adcs	x5,x5,xzr
278bc3d5698SJohn Baldwin	adc	x6,x6,xzr
279bc3d5698SJohn Baldwin
280575878a5SEd Maste#ifdef	__AARCH64EB__
281bc3d5698SJohn Baldwin	rev	x12,x12
282bc3d5698SJohn Baldwin	rev	x13,x13
283bc3d5698SJohn Baldwin#endif
284bc3d5698SJohn Baldwin	adds	x4,x4,x12		// accumulate input
285bc3d5698SJohn Baldwin	adcs	x5,x5,x13
286bc3d5698SJohn Baldwin	adc	x6,x6,x3
287bc3d5698SJohn Baldwin
288bc3d5698SJohn Baldwin	bl	poly1305_mult
289bc3d5698SJohn Baldwin	ldr	x30,[sp,#8]
290bc3d5698SJohn Baldwin
291bc3d5698SJohn Baldwin	cbz	x3,.Lstore_base2_64_neon
292bc3d5698SJohn Baldwin
293bc3d5698SJohn Baldwin	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
294bc3d5698SJohn Baldwin	ubfx	x11,x4,#26,#26
295bc3d5698SJohn Baldwin	extr	x12,x5,x4,#52
296bc3d5698SJohn Baldwin	and	x12,x12,#0x03ffffff
297bc3d5698SJohn Baldwin	ubfx	x13,x5,#14,#26
298bc3d5698SJohn Baldwin	extr	x14,x6,x5,#40
299bc3d5698SJohn Baldwin
300bc3d5698SJohn Baldwin	cbnz	x2,.Leven_neon
301bc3d5698SJohn Baldwin
302bc3d5698SJohn Baldwin	stp	w10,w11,[x0]		// store hash value base 2^26
303bc3d5698SJohn Baldwin	stp	w12,w13,[x0,#8]
304bc3d5698SJohn Baldwin	str	w14,[x0,#16]
305bc3d5698SJohn Baldwin	b	.Lno_data_neon
306bc3d5698SJohn Baldwin
307bc3d5698SJohn Baldwin.align	4
308bc3d5698SJohn Baldwin.Lstore_base2_64_neon:
309bc3d5698SJohn Baldwin	stp	x4,x5,[x0]		// store hash value base 2^64
310bc3d5698SJohn Baldwin	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
311bc3d5698SJohn Baldwin	b	.Lno_data_neon
312bc3d5698SJohn Baldwin
313bc3d5698SJohn Baldwin.align	4
314bc3d5698SJohn Baldwin.Lbase2_64_neon:
315bc3d5698SJohn Baldwin	ldp	x7,x8,[x0,#32]	// load key value
316bc3d5698SJohn Baldwin
317bc3d5698SJohn Baldwin	ldp	x4,x5,[x0]		// load hash value base 2^64
318bc3d5698SJohn Baldwin	ldr	x6,[x0,#16]
319bc3d5698SJohn Baldwin
320bc3d5698SJohn Baldwin	tst	x2,#31
321bc3d5698SJohn Baldwin	b.eq	.Linit_neon
322bc3d5698SJohn Baldwin
323bc3d5698SJohn Baldwin	ldp	x12,x13,[x1],#16	// load input
324bc3d5698SJohn Baldwin	sub	x2,x2,#16
325bc3d5698SJohn Baldwin	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
326575878a5SEd Maste#ifdef	__AARCH64EB__
327bc3d5698SJohn Baldwin	rev	x12,x12
328bc3d5698SJohn Baldwin	rev	x13,x13
329bc3d5698SJohn Baldwin#endif
330bc3d5698SJohn Baldwin	adds	x4,x4,x12		// accumulate input
331bc3d5698SJohn Baldwin	adcs	x5,x5,x13
332bc3d5698SJohn Baldwin	adc	x6,x6,x3
333bc3d5698SJohn Baldwin
334bc3d5698SJohn Baldwin	bl	poly1305_mult
335bc3d5698SJohn Baldwin
336bc3d5698SJohn Baldwin.Linit_neon:
337bc3d5698SJohn Baldwin	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
338bc3d5698SJohn Baldwin	ubfx	x11,x4,#26,#26
339bc3d5698SJohn Baldwin	extr	x12,x5,x4,#52
340bc3d5698SJohn Baldwin	and	x12,x12,#0x03ffffff
341bc3d5698SJohn Baldwin	ubfx	x13,x5,#14,#26
342bc3d5698SJohn Baldwin	extr	x14,x6,x5,#40
343bc3d5698SJohn Baldwin
344bc3d5698SJohn Baldwin	stp	d8,d9,[sp,#16]		// meet ABI requirements
345bc3d5698SJohn Baldwin	stp	d10,d11,[sp,#32]
346bc3d5698SJohn Baldwin	stp	d12,d13,[sp,#48]
347bc3d5698SJohn Baldwin	stp	d14,d15,[sp,#64]
348bc3d5698SJohn Baldwin
349bc3d5698SJohn Baldwin	fmov	d24,x10
350bc3d5698SJohn Baldwin	fmov	d25,x11
351bc3d5698SJohn Baldwin	fmov	d26,x12
352bc3d5698SJohn Baldwin	fmov	d27,x13
353bc3d5698SJohn Baldwin	fmov	d28,x14
354bc3d5698SJohn Baldwin
355bc3d5698SJohn Baldwin	////////////////////////////////// initialize r^n table
356bc3d5698SJohn Baldwin	mov	x4,x7			// r^1
357bc3d5698SJohn Baldwin	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
358bc3d5698SJohn Baldwin	mov	x5,x8
359bc3d5698SJohn Baldwin	mov	x6,xzr
360bc3d5698SJohn Baldwin	add	x0,x0,#48+12
361bc3d5698SJohn Baldwin	bl	poly1305_splat
362bc3d5698SJohn Baldwin
363bc3d5698SJohn Baldwin	bl	poly1305_mult		// r^2
364bc3d5698SJohn Baldwin	sub	x0,x0,#4
365bc3d5698SJohn Baldwin	bl	poly1305_splat
366bc3d5698SJohn Baldwin
367bc3d5698SJohn Baldwin	bl	poly1305_mult		// r^3
368bc3d5698SJohn Baldwin	sub	x0,x0,#4
369bc3d5698SJohn Baldwin	bl	poly1305_splat
370bc3d5698SJohn Baldwin
371bc3d5698SJohn Baldwin	bl	poly1305_mult		// r^4
372bc3d5698SJohn Baldwin	sub	x0,x0,#4
373bc3d5698SJohn Baldwin	bl	poly1305_splat
374bc3d5698SJohn Baldwin	ldr	x30,[sp,#8]
375bc3d5698SJohn Baldwin
376bc3d5698SJohn Baldwin	add	x16,x1,#32
377bc3d5698SJohn Baldwin	adr	x17,.Lzeros
378bc3d5698SJohn Baldwin	subs	x2,x2,#64
379bc3d5698SJohn Baldwin	csel	x16,x17,x16,lo
380bc3d5698SJohn Baldwin
381bc3d5698SJohn Baldwin	mov	x4,#1
382c0855eaaSJohn Baldwin	stur	x4,[x0,#-24]		// set is_base2_26
383bc3d5698SJohn Baldwin	sub	x0,x0,#48		// restore original x0
384bc3d5698SJohn Baldwin	b	.Ldo_neon
385bc3d5698SJohn Baldwin
386bc3d5698SJohn Baldwin.align	4
387bc3d5698SJohn Baldwin.Leven_neon:
388bc3d5698SJohn Baldwin	add	x16,x1,#32
389bc3d5698SJohn Baldwin	adr	x17,.Lzeros
390bc3d5698SJohn Baldwin	subs	x2,x2,#64
391bc3d5698SJohn Baldwin	csel	x16,x17,x16,lo
392bc3d5698SJohn Baldwin
393bc3d5698SJohn Baldwin	stp	d8,d9,[sp,#16]		// meet ABI requirements
394bc3d5698SJohn Baldwin	stp	d10,d11,[sp,#32]
395bc3d5698SJohn Baldwin	stp	d12,d13,[sp,#48]
396bc3d5698SJohn Baldwin	stp	d14,d15,[sp,#64]
397bc3d5698SJohn Baldwin
398bc3d5698SJohn Baldwin	fmov	d24,x10
399bc3d5698SJohn Baldwin	fmov	d25,x11
400bc3d5698SJohn Baldwin	fmov	d26,x12
401bc3d5698SJohn Baldwin	fmov	d27,x13
402bc3d5698SJohn Baldwin	fmov	d28,x14
403bc3d5698SJohn Baldwin
404bc3d5698SJohn Baldwin.Ldo_neon:
405bc3d5698SJohn Baldwin	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
406bc3d5698SJohn Baldwin	ldp	x9,x13,[x16],#48
407bc3d5698SJohn Baldwin
408bc3d5698SJohn Baldwin	lsl	x3,x3,#24
409bc3d5698SJohn Baldwin	add	x15,x0,#48
410bc3d5698SJohn Baldwin
411575878a5SEd Maste#ifdef	__AARCH64EB__
412bc3d5698SJohn Baldwin	rev	x8,x8
413bc3d5698SJohn Baldwin	rev	x12,x12
414bc3d5698SJohn Baldwin	rev	x9,x9
415bc3d5698SJohn Baldwin	rev	x13,x13
416bc3d5698SJohn Baldwin#endif
417bc3d5698SJohn Baldwin	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
418bc3d5698SJohn Baldwin	and	x5,x9,#0x03ffffff
419bc3d5698SJohn Baldwin	ubfx	x6,x8,#26,#26
420bc3d5698SJohn Baldwin	ubfx	x7,x9,#26,#26
421bc3d5698SJohn Baldwin	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
422bc3d5698SJohn Baldwin	extr	x8,x12,x8,#52
423bc3d5698SJohn Baldwin	extr	x9,x13,x9,#52
424bc3d5698SJohn Baldwin	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
425bc3d5698SJohn Baldwin	fmov	d14,x4
426bc3d5698SJohn Baldwin	and	x8,x8,#0x03ffffff
427bc3d5698SJohn Baldwin	and	x9,x9,#0x03ffffff
428bc3d5698SJohn Baldwin	ubfx	x10,x12,#14,#26
429bc3d5698SJohn Baldwin	ubfx	x11,x13,#14,#26
430bc3d5698SJohn Baldwin	add	x12,x3,x12,lsr#40
431bc3d5698SJohn Baldwin	add	x13,x3,x13,lsr#40
432bc3d5698SJohn Baldwin	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
433bc3d5698SJohn Baldwin	fmov	d15,x6
434bc3d5698SJohn Baldwin	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
435bc3d5698SJohn Baldwin	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
436bc3d5698SJohn Baldwin	fmov	d16,x8
437bc3d5698SJohn Baldwin	fmov	d17,x10
438bc3d5698SJohn Baldwin	fmov	d18,x12
439bc3d5698SJohn Baldwin
440bc3d5698SJohn Baldwin	ldp	x8,x12,[x1],#16	// inp[0:1]
441bc3d5698SJohn Baldwin	ldp	x9,x13,[x1],#48
442bc3d5698SJohn Baldwin
443bc3d5698SJohn Baldwin	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
444bc3d5698SJohn Baldwin	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
445bc3d5698SJohn Baldwin	ld1	{v8.4s},[x15]
446bc3d5698SJohn Baldwin
447575878a5SEd Maste#ifdef	__AARCH64EB__
448bc3d5698SJohn Baldwin	rev	x8,x8
449bc3d5698SJohn Baldwin	rev	x12,x12
450bc3d5698SJohn Baldwin	rev	x9,x9
451bc3d5698SJohn Baldwin	rev	x13,x13
452bc3d5698SJohn Baldwin#endif
453bc3d5698SJohn Baldwin	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
454bc3d5698SJohn Baldwin	and	x5,x9,#0x03ffffff
455bc3d5698SJohn Baldwin	ubfx	x6,x8,#26,#26
456bc3d5698SJohn Baldwin	ubfx	x7,x9,#26,#26
457bc3d5698SJohn Baldwin	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
458bc3d5698SJohn Baldwin	extr	x8,x12,x8,#52
459bc3d5698SJohn Baldwin	extr	x9,x13,x9,#52
460bc3d5698SJohn Baldwin	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
461bc3d5698SJohn Baldwin	fmov	d9,x4
462bc3d5698SJohn Baldwin	and	x8,x8,#0x03ffffff
463bc3d5698SJohn Baldwin	and	x9,x9,#0x03ffffff
464bc3d5698SJohn Baldwin	ubfx	x10,x12,#14,#26
465bc3d5698SJohn Baldwin	ubfx	x11,x13,#14,#26
466bc3d5698SJohn Baldwin	add	x12,x3,x12,lsr#40
467bc3d5698SJohn Baldwin	add	x13,x3,x13,lsr#40
468bc3d5698SJohn Baldwin	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
469bc3d5698SJohn Baldwin	fmov	d10,x6
470bc3d5698SJohn Baldwin	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
471bc3d5698SJohn Baldwin	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
472bc3d5698SJohn Baldwin	movi	v31.2d,#-1
473bc3d5698SJohn Baldwin	fmov	d11,x8
474bc3d5698SJohn Baldwin	fmov	d12,x10
475bc3d5698SJohn Baldwin	fmov	d13,x12
476bc3d5698SJohn Baldwin	ushr	v31.2d,v31.2d,#38
477bc3d5698SJohn Baldwin
478bc3d5698SJohn Baldwin	b.ls	.Lskip_loop
479bc3d5698SJohn Baldwin
480bc3d5698SJohn Baldwin.align	4
481bc3d5698SJohn Baldwin.Loop_neon:
482bc3d5698SJohn Baldwin	////////////////////////////////////////////////////////////////
483bc3d5698SJohn Baldwin	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
484bc3d5698SJohn Baldwin	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
485bc3d5698SJohn Baldwin	//   ___________________/
486bc3d5698SJohn Baldwin	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
487bc3d5698SJohn Baldwin	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
488bc3d5698SJohn Baldwin	//   ___________________/ ____________________/
489bc3d5698SJohn Baldwin	//
490bc3d5698SJohn Baldwin	// Note that we start with inp[2:3]*r^2. This is because it
491bc3d5698SJohn Baldwin	// doesn't depend on reduction in previous iteration.
492bc3d5698SJohn Baldwin	////////////////////////////////////////////////////////////////
493bc3d5698SJohn Baldwin	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
494bc3d5698SJohn Baldwin	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
495bc3d5698SJohn Baldwin	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
496bc3d5698SJohn Baldwin	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
497bc3d5698SJohn Baldwin	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
498bc3d5698SJohn Baldwin
499bc3d5698SJohn Baldwin	subs	x2,x2,#64
500bc3d5698SJohn Baldwin	umull	v23.2d,v14.2s,v7.s[2]
501bc3d5698SJohn Baldwin	csel	x16,x17,x16,lo
502bc3d5698SJohn Baldwin	umull	v22.2d,v14.2s,v5.s[2]
503bc3d5698SJohn Baldwin	umull	v21.2d,v14.2s,v3.s[2]
504bc3d5698SJohn Baldwin	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
505bc3d5698SJohn Baldwin	umull	v20.2d,v14.2s,v1.s[2]
506bc3d5698SJohn Baldwin	ldp	x9,x13,[x16],#48
507bc3d5698SJohn Baldwin	umull	v19.2d,v14.2s,v0.s[2]
508575878a5SEd Maste#ifdef	__AARCH64EB__
509bc3d5698SJohn Baldwin	rev	x8,x8
510bc3d5698SJohn Baldwin	rev	x12,x12
511bc3d5698SJohn Baldwin	rev	x9,x9
512bc3d5698SJohn Baldwin	rev	x13,x13
513bc3d5698SJohn Baldwin#endif
514bc3d5698SJohn Baldwin
515bc3d5698SJohn Baldwin	umlal	v23.2d,v15.2s,v5.s[2]
516bc3d5698SJohn Baldwin	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
517bc3d5698SJohn Baldwin	umlal	v22.2d,v15.2s,v3.s[2]
518bc3d5698SJohn Baldwin	and	x5,x9,#0x03ffffff
519bc3d5698SJohn Baldwin	umlal	v21.2d,v15.2s,v1.s[2]
520bc3d5698SJohn Baldwin	ubfx	x6,x8,#26,#26
521bc3d5698SJohn Baldwin	umlal	v20.2d,v15.2s,v0.s[2]
522bc3d5698SJohn Baldwin	ubfx	x7,x9,#26,#26
523bc3d5698SJohn Baldwin	umlal	v19.2d,v15.2s,v8.s[2]
524bc3d5698SJohn Baldwin	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
525bc3d5698SJohn Baldwin
526bc3d5698SJohn Baldwin	umlal	v23.2d,v16.2s,v3.s[2]
527bc3d5698SJohn Baldwin	extr	x8,x12,x8,#52
528bc3d5698SJohn Baldwin	umlal	v22.2d,v16.2s,v1.s[2]
529bc3d5698SJohn Baldwin	extr	x9,x13,x9,#52
530bc3d5698SJohn Baldwin	umlal	v21.2d,v16.2s,v0.s[2]
531bc3d5698SJohn Baldwin	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
532bc3d5698SJohn Baldwin	umlal	v20.2d,v16.2s,v8.s[2]
533bc3d5698SJohn Baldwin	fmov	d14,x4
534bc3d5698SJohn Baldwin	umlal	v19.2d,v16.2s,v6.s[2]
535bc3d5698SJohn Baldwin	and	x8,x8,#0x03ffffff
536bc3d5698SJohn Baldwin
537bc3d5698SJohn Baldwin	umlal	v23.2d,v17.2s,v1.s[2]
538bc3d5698SJohn Baldwin	and	x9,x9,#0x03ffffff
539bc3d5698SJohn Baldwin	umlal	v22.2d,v17.2s,v0.s[2]
540bc3d5698SJohn Baldwin	ubfx	x10,x12,#14,#26
541bc3d5698SJohn Baldwin	umlal	v21.2d,v17.2s,v8.s[2]
542bc3d5698SJohn Baldwin	ubfx	x11,x13,#14,#26
543bc3d5698SJohn Baldwin	umlal	v20.2d,v17.2s,v6.s[2]
544bc3d5698SJohn Baldwin	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
545bc3d5698SJohn Baldwin	umlal	v19.2d,v17.2s,v4.s[2]
546bc3d5698SJohn Baldwin	fmov	d15,x6
547bc3d5698SJohn Baldwin
548bc3d5698SJohn Baldwin	add	v11.2s,v11.2s,v26.2s
549bc3d5698SJohn Baldwin	add	x12,x3,x12,lsr#40
550bc3d5698SJohn Baldwin	umlal	v23.2d,v18.2s,v0.s[2]
551bc3d5698SJohn Baldwin	add	x13,x3,x13,lsr#40
552bc3d5698SJohn Baldwin	umlal	v22.2d,v18.2s,v8.s[2]
553bc3d5698SJohn Baldwin	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
554bc3d5698SJohn Baldwin	umlal	v21.2d,v18.2s,v6.s[2]
555bc3d5698SJohn Baldwin	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
556bc3d5698SJohn Baldwin	umlal	v20.2d,v18.2s,v4.s[2]
557bc3d5698SJohn Baldwin	fmov	d16,x8
558bc3d5698SJohn Baldwin	umlal	v19.2d,v18.2s,v2.s[2]
559bc3d5698SJohn Baldwin	fmov	d17,x10
560bc3d5698SJohn Baldwin
561bc3d5698SJohn Baldwin	////////////////////////////////////////////////////////////////
562bc3d5698SJohn Baldwin	// (hash+inp[0:1])*r^4 and accumulate
563bc3d5698SJohn Baldwin
564bc3d5698SJohn Baldwin	add	v9.2s,v9.2s,v24.2s
565bc3d5698SJohn Baldwin	fmov	d18,x12
566bc3d5698SJohn Baldwin	umlal	v22.2d,v11.2s,v1.s[0]
567bc3d5698SJohn Baldwin	ldp	x8,x12,[x1],#16	// inp[0:1]
568bc3d5698SJohn Baldwin	umlal	v19.2d,v11.2s,v6.s[0]
569bc3d5698SJohn Baldwin	ldp	x9,x13,[x1],#48
570bc3d5698SJohn Baldwin	umlal	v23.2d,v11.2s,v3.s[0]
571bc3d5698SJohn Baldwin	umlal	v20.2d,v11.2s,v8.s[0]
572bc3d5698SJohn Baldwin	umlal	v21.2d,v11.2s,v0.s[0]
573575878a5SEd Maste#ifdef	__AARCH64EB__
574bc3d5698SJohn Baldwin	rev	x8,x8
575bc3d5698SJohn Baldwin	rev	x12,x12
576bc3d5698SJohn Baldwin	rev	x9,x9
577bc3d5698SJohn Baldwin	rev	x13,x13
578bc3d5698SJohn Baldwin#endif
579bc3d5698SJohn Baldwin
580bc3d5698SJohn Baldwin	add	v10.2s,v10.2s,v25.2s
581bc3d5698SJohn Baldwin	umlal	v22.2d,v9.2s,v5.s[0]
582bc3d5698SJohn Baldwin	umlal	v23.2d,v9.2s,v7.s[0]
583bc3d5698SJohn Baldwin	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
584bc3d5698SJohn Baldwin	umlal	v21.2d,v9.2s,v3.s[0]
585bc3d5698SJohn Baldwin	and	x5,x9,#0x03ffffff
586bc3d5698SJohn Baldwin	umlal	v19.2d,v9.2s,v0.s[0]
587bc3d5698SJohn Baldwin	ubfx	x6,x8,#26,#26
588bc3d5698SJohn Baldwin	umlal	v20.2d,v9.2s,v1.s[0]
589bc3d5698SJohn Baldwin	ubfx	x7,x9,#26,#26
590bc3d5698SJohn Baldwin
591bc3d5698SJohn Baldwin	add	v12.2s,v12.2s,v27.2s
592bc3d5698SJohn Baldwin	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
593bc3d5698SJohn Baldwin	umlal	v22.2d,v10.2s,v3.s[0]
594bc3d5698SJohn Baldwin	extr	x8,x12,x8,#52
595bc3d5698SJohn Baldwin	umlal	v23.2d,v10.2s,v5.s[0]
596bc3d5698SJohn Baldwin	extr	x9,x13,x9,#52
597bc3d5698SJohn Baldwin	umlal	v19.2d,v10.2s,v8.s[0]
598bc3d5698SJohn Baldwin	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
599bc3d5698SJohn Baldwin	umlal	v21.2d,v10.2s,v1.s[0]
600bc3d5698SJohn Baldwin	fmov	d9,x4
601bc3d5698SJohn Baldwin	umlal	v20.2d,v10.2s,v0.s[0]
602bc3d5698SJohn Baldwin	and	x8,x8,#0x03ffffff
603bc3d5698SJohn Baldwin
604bc3d5698SJohn Baldwin	add	v13.2s,v13.2s,v28.2s
605bc3d5698SJohn Baldwin	and	x9,x9,#0x03ffffff
606bc3d5698SJohn Baldwin	umlal	v22.2d,v12.2s,v0.s[0]
607bc3d5698SJohn Baldwin	ubfx	x10,x12,#14,#26
608bc3d5698SJohn Baldwin	umlal	v19.2d,v12.2s,v4.s[0]
609bc3d5698SJohn Baldwin	ubfx	x11,x13,#14,#26
610bc3d5698SJohn Baldwin	umlal	v23.2d,v12.2s,v1.s[0]
611bc3d5698SJohn Baldwin	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
612bc3d5698SJohn Baldwin	umlal	v20.2d,v12.2s,v6.s[0]
613bc3d5698SJohn Baldwin	fmov	d10,x6
614bc3d5698SJohn Baldwin	umlal	v21.2d,v12.2s,v8.s[0]
615bc3d5698SJohn Baldwin	add	x12,x3,x12,lsr#40
616bc3d5698SJohn Baldwin
617bc3d5698SJohn Baldwin	umlal	v22.2d,v13.2s,v8.s[0]
618bc3d5698SJohn Baldwin	add	x13,x3,x13,lsr#40
619bc3d5698SJohn Baldwin	umlal	v19.2d,v13.2s,v2.s[0]
620bc3d5698SJohn Baldwin	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
621bc3d5698SJohn Baldwin	umlal	v23.2d,v13.2s,v0.s[0]
622bc3d5698SJohn Baldwin	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
623bc3d5698SJohn Baldwin	umlal	v20.2d,v13.2s,v4.s[0]
624bc3d5698SJohn Baldwin	fmov	d11,x8
625bc3d5698SJohn Baldwin	umlal	v21.2d,v13.2s,v6.s[0]
626bc3d5698SJohn Baldwin	fmov	d12,x10
627bc3d5698SJohn Baldwin	fmov	d13,x12
628bc3d5698SJohn Baldwin
629bc3d5698SJohn Baldwin	/////////////////////////////////////////////////////////////////
630bc3d5698SJohn Baldwin	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
631bc3d5698SJohn Baldwin	// and P. Schwabe
632bc3d5698SJohn Baldwin	//
633bc3d5698SJohn Baldwin	// [see discussion in poly1305-armv4 module]
634bc3d5698SJohn Baldwin
635bc3d5698SJohn Baldwin	ushr	v29.2d,v22.2d,#26
636bc3d5698SJohn Baldwin	xtn	v27.2s,v22.2d
637bc3d5698SJohn Baldwin	ushr	v30.2d,v19.2d,#26
638bc3d5698SJohn Baldwin	and	v19.16b,v19.16b,v31.16b
639bc3d5698SJohn Baldwin	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
640bc3d5698SJohn Baldwin	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
641bc3d5698SJohn Baldwin	add	v20.2d,v20.2d,v30.2d	// h0 -> h1
642bc3d5698SJohn Baldwin
643bc3d5698SJohn Baldwin	ushr	v29.2d,v23.2d,#26
644bc3d5698SJohn Baldwin	xtn	v28.2s,v23.2d
645bc3d5698SJohn Baldwin	ushr	v30.2d,v20.2d,#26
646bc3d5698SJohn Baldwin	xtn	v25.2s,v20.2d
647bc3d5698SJohn Baldwin	bic	v28.2s,#0xfc,lsl#24
648bc3d5698SJohn Baldwin	add	v21.2d,v21.2d,v30.2d	// h1 -> h2
649bc3d5698SJohn Baldwin
650bc3d5698SJohn Baldwin	add	v19.2d,v19.2d,v29.2d
651bc3d5698SJohn Baldwin	shl	v29.2d,v29.2d,#2
652bc3d5698SJohn Baldwin	shrn	v30.2s,v21.2d,#26
653bc3d5698SJohn Baldwin	xtn	v26.2s,v21.2d
654bc3d5698SJohn Baldwin	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
655bc3d5698SJohn Baldwin	bic	v25.2s,#0xfc,lsl#24
656bc3d5698SJohn Baldwin	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
657bc3d5698SJohn Baldwin	bic	v26.2s,#0xfc,lsl#24
658bc3d5698SJohn Baldwin
659bc3d5698SJohn Baldwin	shrn	v29.2s,v19.2d,#26
660bc3d5698SJohn Baldwin	xtn	v24.2s,v19.2d
661bc3d5698SJohn Baldwin	ushr	v30.2s,v27.2s,#26
662bc3d5698SJohn Baldwin	bic	v27.2s,#0xfc,lsl#24
663bc3d5698SJohn Baldwin	bic	v24.2s,#0xfc,lsl#24
664bc3d5698SJohn Baldwin	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
665bc3d5698SJohn Baldwin	add	v28.2s,v28.2s,v30.2s		// h3 -> h4
666bc3d5698SJohn Baldwin
667bc3d5698SJohn Baldwin	b.hi	.Loop_neon
668bc3d5698SJohn Baldwin
669bc3d5698SJohn Baldwin.Lskip_loop:
670bc3d5698SJohn Baldwin	dup	v16.2d,v16.d[0]
671bc3d5698SJohn Baldwin	add	v11.2s,v11.2s,v26.2s
672bc3d5698SJohn Baldwin
673bc3d5698SJohn Baldwin	////////////////////////////////////////////////////////////////
674bc3d5698SJohn Baldwin	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
675bc3d5698SJohn Baldwin
676bc3d5698SJohn Baldwin	adds	x2,x2,#32
677bc3d5698SJohn Baldwin	b.ne	.Long_tail
678bc3d5698SJohn Baldwin
679bc3d5698SJohn Baldwin	dup	v16.2d,v11.d[0]
680bc3d5698SJohn Baldwin	add	v14.2s,v9.2s,v24.2s
681bc3d5698SJohn Baldwin	add	v17.2s,v12.2s,v27.2s
682bc3d5698SJohn Baldwin	add	v15.2s,v10.2s,v25.2s
683bc3d5698SJohn Baldwin	add	v18.2s,v13.2s,v28.2s
684bc3d5698SJohn Baldwin
685bc3d5698SJohn Baldwin.Long_tail:
686bc3d5698SJohn Baldwin	dup	v14.2d,v14.d[0]
687bc3d5698SJohn Baldwin	umull2	v19.2d,v16.4s,v6.4s
688bc3d5698SJohn Baldwin	umull2	v22.2d,v16.4s,v1.4s
689bc3d5698SJohn Baldwin	umull2	v23.2d,v16.4s,v3.4s
690bc3d5698SJohn Baldwin	umull2	v21.2d,v16.4s,v0.4s
691bc3d5698SJohn Baldwin	umull2	v20.2d,v16.4s,v8.4s
692bc3d5698SJohn Baldwin
693bc3d5698SJohn Baldwin	dup	v15.2d,v15.d[0]
694bc3d5698SJohn Baldwin	umlal2	v19.2d,v14.4s,v0.4s
695bc3d5698SJohn Baldwin	umlal2	v21.2d,v14.4s,v3.4s
696bc3d5698SJohn Baldwin	umlal2	v22.2d,v14.4s,v5.4s
697bc3d5698SJohn Baldwin	umlal2	v23.2d,v14.4s,v7.4s
698bc3d5698SJohn Baldwin	umlal2	v20.2d,v14.4s,v1.4s
699bc3d5698SJohn Baldwin
700bc3d5698SJohn Baldwin	dup	v17.2d,v17.d[0]
701bc3d5698SJohn Baldwin	umlal2	v19.2d,v15.4s,v8.4s
702bc3d5698SJohn Baldwin	umlal2	v22.2d,v15.4s,v3.4s
703bc3d5698SJohn Baldwin	umlal2	v21.2d,v15.4s,v1.4s
704bc3d5698SJohn Baldwin	umlal2	v23.2d,v15.4s,v5.4s
705bc3d5698SJohn Baldwin	umlal2	v20.2d,v15.4s,v0.4s
706bc3d5698SJohn Baldwin
707bc3d5698SJohn Baldwin	dup	v18.2d,v18.d[0]
708bc3d5698SJohn Baldwin	umlal2	v22.2d,v17.4s,v0.4s
709bc3d5698SJohn Baldwin	umlal2	v23.2d,v17.4s,v1.4s
710bc3d5698SJohn Baldwin	umlal2	v19.2d,v17.4s,v4.4s
711bc3d5698SJohn Baldwin	umlal2	v20.2d,v17.4s,v6.4s
712bc3d5698SJohn Baldwin	umlal2	v21.2d,v17.4s,v8.4s
713bc3d5698SJohn Baldwin
714bc3d5698SJohn Baldwin	umlal2	v22.2d,v18.4s,v8.4s
715bc3d5698SJohn Baldwin	umlal2	v19.2d,v18.4s,v2.4s
716bc3d5698SJohn Baldwin	umlal2	v23.2d,v18.4s,v0.4s
717bc3d5698SJohn Baldwin	umlal2	v20.2d,v18.4s,v4.4s
718bc3d5698SJohn Baldwin	umlal2	v21.2d,v18.4s,v6.4s
719bc3d5698SJohn Baldwin
720bc3d5698SJohn Baldwin	b.eq	.Lshort_tail
721bc3d5698SJohn Baldwin
722bc3d5698SJohn Baldwin	////////////////////////////////////////////////////////////////
723bc3d5698SJohn Baldwin	// (hash+inp[0:1])*r^4:r^3 and accumulate
724bc3d5698SJohn Baldwin
725bc3d5698SJohn Baldwin	add	v9.2s,v9.2s,v24.2s
726bc3d5698SJohn Baldwin	umlal	v22.2d,v11.2s,v1.2s
727bc3d5698SJohn Baldwin	umlal	v19.2d,v11.2s,v6.2s
728bc3d5698SJohn Baldwin	umlal	v23.2d,v11.2s,v3.2s
729bc3d5698SJohn Baldwin	umlal	v20.2d,v11.2s,v8.2s
730bc3d5698SJohn Baldwin	umlal	v21.2d,v11.2s,v0.2s
731bc3d5698SJohn Baldwin
732bc3d5698SJohn Baldwin	add	v10.2s,v10.2s,v25.2s
733bc3d5698SJohn Baldwin	umlal	v22.2d,v9.2s,v5.2s
734bc3d5698SJohn Baldwin	umlal	v19.2d,v9.2s,v0.2s
735bc3d5698SJohn Baldwin	umlal	v23.2d,v9.2s,v7.2s
736bc3d5698SJohn Baldwin	umlal	v20.2d,v9.2s,v1.2s
737bc3d5698SJohn Baldwin	umlal	v21.2d,v9.2s,v3.2s
738bc3d5698SJohn Baldwin
739bc3d5698SJohn Baldwin	add	v12.2s,v12.2s,v27.2s
740bc3d5698SJohn Baldwin	umlal	v22.2d,v10.2s,v3.2s
741bc3d5698SJohn Baldwin	umlal	v19.2d,v10.2s,v8.2s
742bc3d5698SJohn Baldwin	umlal	v23.2d,v10.2s,v5.2s
743bc3d5698SJohn Baldwin	umlal	v20.2d,v10.2s,v0.2s
744bc3d5698SJohn Baldwin	umlal	v21.2d,v10.2s,v1.2s
745bc3d5698SJohn Baldwin
746bc3d5698SJohn Baldwin	add	v13.2s,v13.2s,v28.2s
747bc3d5698SJohn Baldwin	umlal	v22.2d,v12.2s,v0.2s
748bc3d5698SJohn Baldwin	umlal	v19.2d,v12.2s,v4.2s
749bc3d5698SJohn Baldwin	umlal	v23.2d,v12.2s,v1.2s
750bc3d5698SJohn Baldwin	umlal	v20.2d,v12.2s,v6.2s
751bc3d5698SJohn Baldwin	umlal	v21.2d,v12.2s,v8.2s
752bc3d5698SJohn Baldwin
753bc3d5698SJohn Baldwin	umlal	v22.2d,v13.2s,v8.2s
754bc3d5698SJohn Baldwin	umlal	v19.2d,v13.2s,v2.2s
755bc3d5698SJohn Baldwin	umlal	v23.2d,v13.2s,v0.2s
756bc3d5698SJohn Baldwin	umlal	v20.2d,v13.2s,v4.2s
757bc3d5698SJohn Baldwin	umlal	v21.2d,v13.2s,v6.2s
758bc3d5698SJohn Baldwin
759bc3d5698SJohn Baldwin.Lshort_tail:
760bc3d5698SJohn Baldwin	////////////////////////////////////////////////////////////////
761bc3d5698SJohn Baldwin	// horizontal add
762bc3d5698SJohn Baldwin
763bc3d5698SJohn Baldwin	addp	v22.2d,v22.2d,v22.2d
764bc3d5698SJohn Baldwin	ldp	d8,d9,[sp,#16]		// meet ABI requirements
765bc3d5698SJohn Baldwin	addp	v19.2d,v19.2d,v19.2d
766bc3d5698SJohn Baldwin	ldp	d10,d11,[sp,#32]
767bc3d5698SJohn Baldwin	addp	v23.2d,v23.2d,v23.2d
768bc3d5698SJohn Baldwin	ldp	d12,d13,[sp,#48]
769bc3d5698SJohn Baldwin	addp	v20.2d,v20.2d,v20.2d
770bc3d5698SJohn Baldwin	ldp	d14,d15,[sp,#64]
771bc3d5698SJohn Baldwin	addp	v21.2d,v21.2d,v21.2d
772bc3d5698SJohn Baldwin
773bc3d5698SJohn Baldwin	////////////////////////////////////////////////////////////////
774bc3d5698SJohn Baldwin	// lazy reduction, but without narrowing
775bc3d5698SJohn Baldwin
776bc3d5698SJohn Baldwin	ushr	v29.2d,v22.2d,#26
777bc3d5698SJohn Baldwin	and	v22.16b,v22.16b,v31.16b
778bc3d5698SJohn Baldwin	ushr	v30.2d,v19.2d,#26
779bc3d5698SJohn Baldwin	and	v19.16b,v19.16b,v31.16b
780bc3d5698SJohn Baldwin
781bc3d5698SJohn Baldwin	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
782bc3d5698SJohn Baldwin	add	v20.2d,v20.2d,v30.2d	// h0 -> h1
783bc3d5698SJohn Baldwin
784bc3d5698SJohn Baldwin	ushr	v29.2d,v23.2d,#26
785bc3d5698SJohn Baldwin	and	v23.16b,v23.16b,v31.16b
786bc3d5698SJohn Baldwin	ushr	v30.2d,v20.2d,#26
787bc3d5698SJohn Baldwin	and	v20.16b,v20.16b,v31.16b
788bc3d5698SJohn Baldwin	add	v21.2d,v21.2d,v30.2d	// h1 -> h2
789bc3d5698SJohn Baldwin
790bc3d5698SJohn Baldwin	add	v19.2d,v19.2d,v29.2d
791bc3d5698SJohn Baldwin	shl	v29.2d,v29.2d,#2
792bc3d5698SJohn Baldwin	ushr	v30.2d,v21.2d,#26
793bc3d5698SJohn Baldwin	and	v21.16b,v21.16b,v31.16b
794bc3d5698SJohn Baldwin	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
795bc3d5698SJohn Baldwin	add	v22.2d,v22.2d,v30.2d	// h2 -> h3
796bc3d5698SJohn Baldwin
797bc3d5698SJohn Baldwin	ushr	v29.2d,v19.2d,#26
798bc3d5698SJohn Baldwin	and	v19.16b,v19.16b,v31.16b
799bc3d5698SJohn Baldwin	ushr	v30.2d,v22.2d,#26
800bc3d5698SJohn Baldwin	and	v22.16b,v22.16b,v31.16b
801bc3d5698SJohn Baldwin	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
802bc3d5698SJohn Baldwin	add	v23.2d,v23.2d,v30.2d	// h3 -> h4
803bc3d5698SJohn Baldwin
804bc3d5698SJohn Baldwin	////////////////////////////////////////////////////////////////
805bc3d5698SJohn Baldwin	// write the result, can be partially reduced
806bc3d5698SJohn Baldwin
807bc3d5698SJohn Baldwin	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
808bc3d5698SJohn Baldwin	st1	{v23.s}[0],[x0]
809bc3d5698SJohn Baldwin
810bc3d5698SJohn Baldwin.Lno_data_neon:
811bc3d5698SJohn Baldwin	ldr	x29,[sp],#80
812bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
813bc3d5698SJohn Baldwin	ret
814bc3d5698SJohn Baldwin.size	poly1305_blocks_neon,.-poly1305_blocks_neon
815bc3d5698SJohn Baldwin
////////////////////////////////////////////////////////////////
// poly1305_emit_neon: produce the final 16-byte tag from a hash that
// is stored as five base 2^26 limbs.
//
// Registers as used by the code below:
//   x0 - pointer to the state: five 32-bit hash limbs at [x0]..[x0,#16]
//        and a 64-bit selector word at [x0,#24]
//   x1 - pointer to the 16-byte output buffer
//   x2 - pointer to the 16-byte nonce
// Writes x4-x6, x10-x14, x17 and the condition flags. No stack frame
// is set up here and the routine makes no calls.
//
// NOTE(review): the word at [x0,#24] is presumably the is_base2_26 flag
// mentioned in poly1305_init; when it is zero the hash is assumed to be
// in base 2^64 form and the work is handed to poly1305_emit -- confirm.
////////////////////////////////////////////////////////////////
816bc3d5698SJohn Baldwin.type	poly1305_emit_neon,%function
817bc3d5698SJohn Baldwin.align	5
818bc3d5698SJohn Baldwinpoly1305_emit_neon:
819c0855eaaSJohn Baldwin.Lpoly1305_emit_neon:
820bd9588bcSAndrew Turner	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
821bd9588bcSAndrew Turner	// but a pointer to it is returned by poly1305_init
822bd9588bcSAndrew Turner	AARCH64_VALID_CALL_TARGET
	// Selector word is zero: branch (not call) to poly1305_emit with
	// x0/x1/x2 untouched, so it returns directly to our caller.
823bc3d5698SJohn Baldwin	ldr	x17,[x0,#24]
824bc3d5698SJohn Baldwin	cbz	x17,poly1305_emit
825bc3d5698SJohn Baldwin
	// 32-bit loads zero the upper halves, so x10..x14 = h0..h4.
826bc3d5698SJohn Baldwin	ldp	w10,w11,[x0]		// load hash value base 2^26
827bc3d5698SJohn Baldwin	ldp	w12,w13,[x0,#8]
828bc3d5698SJohn Baldwin	ldr	w14,[x0,#16]
829bc3d5698SJohn Baldwin
	// Repack into three 64-bit limbs x4 (low), x5, x6 (high):
	//   x4 = h0 + (h1 << 26) + (h2 << 52)            carry -> x5
	//   x5 = (h2 >> 12) + (h3 << 14) + (h4 << 40)    carry -> x6
	//   x6 = (h4 >> 24)
	// The plain add/lsr instructions between each adds and its adc do
	// not touch the flags, which is what keeps the carry alive.
830bc3d5698SJohn Baldwin	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
831bc3d5698SJohn Baldwin	lsr	x5,x12,#12
832bc3d5698SJohn Baldwin	adds	x4,x4,x12,lsl#52
833bc3d5698SJohn Baldwin	add	x5,x5,x13,lsl#14
834bc3d5698SJohn Baldwin	adc	x5,x5,xzr
835bc3d5698SJohn Baldwin	lsr	x6,x14,#24
836bc3d5698SJohn Baldwin	adds	x5,x5,x14,lsl#40
837bc3d5698SJohn Baldwin	adc	x6,x6,xzr		// can be partially reduced...
838bc3d5698SJohn Baldwin
839bc3d5698SJohn Baldwin	ldp	x10,x11,[x2]	// load nonce
840bc3d5698SJohn Baldwin
	// Fold everything at or above bit 130 back into the low limbs:
	// x12 = (x6 & ~3) + (x6 >> 2) = 5 * (x6 >> 2), using
	// 2^130 == 5 (mod 2^130 - 5); x6 keeps only its low two bits.
841bc3d5698SJohn Baldwin	and	x12,x6,#-4		// ... so reduce
842bc3d5698SJohn Baldwin	add	x12,x12,x6,lsr#2
843bc3d5698SJohn Baldwin	and	x6,x6,#3
844bc3d5698SJohn Baldwin	adds	x4,x4,x12
845bc3d5698SJohn Baldwin	adcs	x5,x5,xzr
846bc3d5698SJohn Baldwin	adc	x6,x6,xzr
847bc3d5698SJohn Baldwin
	// x14:x13:x12 = h + 5. If that reaches bit 130 (any bit of x14
	// above the low two is set) then h >= 2^130 - 5 and the low 128
	// bits of h + 5 are the reduced value.
848bc3d5698SJohn Baldwin	adds	x12,x4,#5		// compare to modulus
849bc3d5698SJohn Baldwin	adcs	x13,x5,xzr
850bc3d5698SJohn Baldwin	adc	x14,x6,xzr
851bc3d5698SJohn Baldwin
852bc3d5698SJohn Baldwin	tst	x14,#-4			// see if it's carried/borrowed
853bc3d5698SJohn Baldwin
	// Branch-free select: keep h when (x14 & ~3) == 0, else take h + 5.
854bc3d5698SJohn Baldwin	csel	x4,x4,x12,eq
855bc3d5698SJohn Baldwin	csel	x5,x5,x13,eq
856bc3d5698SJohn Baldwin
857575878a5SEd Maste#ifdef	__AARCH64EB__
858bc3d5698SJohn Baldwin	ror	x10,x10,#32		// flip nonce words
859bc3d5698SJohn Baldwin	ror	x11,x11,#32
860bc3d5698SJohn Baldwin#endif
	// 128-bit add; the final adc does not set flags and the carry out
	// of bit 127 is dropped, i.e. the tag is (h + nonce) mod 2^128.
861bc3d5698SJohn Baldwin	adds	x4,x4,x10		// accumulate nonce
862bc3d5698SJohn Baldwin	adc	x5,x5,x11
863575878a5SEd Maste#ifdef	__AARCH64EB__
864bc3d5698SJohn Baldwin	rev	x4,x4			// flip output bytes
865bc3d5698SJohn Baldwin	rev	x5,x5
866bc3d5698SJohn Baldwin#endif
867bc3d5698SJohn Baldwin	stp	x4,x5,[x1]		// write result
868bc3d5698SJohn Baldwin
869bc3d5698SJohn Baldwin	ret
870bc3d5698SJohn Baldwin.size	poly1305_emit_neon,.-poly1305_emit_neon
871bc3d5698SJohn Baldwin
// Constant data emitted after the code.
//
// .Lzeros: eight 32-bit zero words (32 bytes), aligned with .align 5
// (GAS on AArch64 takes the operand as a power of two: 2^5 = 32 bytes).
// NOTE(review): presumably a zero source block for
// poly1305_blocks_neon -- confirm at its use site.
872bc3d5698SJohn Baldwin.align	5
873bc3d5698SJohn Baldwin.Lzeros:
874bc3d5698SJohn Baldwin.long	0,0,0,0,0,0,0,0
// NUL-terminated ASCII identification string:
//   "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
875bc3d5698SJohn Baldwin.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align to 4 bytes after the odd-length string. The second .align 2
// is redundant (already aligned) and is left as generated.
876bc3d5698SJohn Baldwin.align	2
877bc3d5698SJohn Baldwin.align	2
878