1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
// Fallback for preprocessors that do not provide __has_feature(): define it
// to evaluate to 0 so the feature test below still preprocesses.
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
// Under MemorySanitizer, define OPENSSL_NO_ASM (unless it is already defined).
// The rest of this file is guarded by !defined(OPENSSL_NO_ASM), so that
// compiles the assembly out.
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
15#include <openssl/arm_arch.h>
16
17#if __ARM_MAX_ARCH__>=7
18.text
19
// ---------------------------------------------------------------------------
// _gcm_init_v8: precompute the GHASH key table from the hash key H.
//
// In:   x0 = Htable (output; 96 bytes are written)
//       x1 = H      (16 bytes are read)
// Out:  six 16-byte entries stored at x0:
//         Htable[0] = "twisted" H (H shifted left by one bit as a 128-bit
//                     value, XORed with the 0xc2....01 constant when the
//                     carry bit is set)
//         Htable[1] = packed Karatsuba pre-processed values:
//                     low 64 bits  = H.lo^H.hi,
//                     high 64 bits = H^2.lo^H^2.hi
//         Htable[2] = H^2
//         Htable[3] = H^3
//         Htable[4] = packed Karatsuba pre-processed values for H^3 and H^4
//                     (same layout as Htable[1])
//         Htable[5] = H^4
// Uses: v0-v7, v16-v22. x0 is advanced by 48 (the final store does not
//       post-increment). Leaf routine; the stack is not touched.
// ---------------------------------------------------------------------------
20.globl	_gcm_init_v8
21.private_extern	_gcm_init_v8
22
23.align	4
24_gcm_init_v8:
	// AARCH64_VALID_CALL_TARGET is a macro from <openssl/arm_arch.h>;
	// NOTE(review): presumably a BTI landing pad - confirm in that header.
25	AARCH64_VALID_CALL_TARGET
26	ld1	{v17.2d},[x1]		//load input H
27	movi	v19.16b,#0xe1
28	shl	v19.2d,v19.2d,#57		//0xc2.0: each 64-bit lane becomes 0xc2<<56
29	ext	v3.16b,v17.16b,v17.16b,#8		//v3 = H with its 64-bit halves swapped
30	ushr	v18.2d,v19.2d,#63		//1 in each 64-bit lane
31	dup	v17.4s,v17.s[1]		//replicate 32-bit word 1 of H
32	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
33	ushr	v18.2d,v3.2d,#63		//top bit of each 64-bit lane
34	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
35	and	v18.16b,v18.16b,v16.16b
36	shl	v3.2d,v3.2d,#1
37	ext	v18.16b,v18.16b,v18.16b,#8		//move the inter-lane carry to the other lane
38	and	v16.16b,v16.16b,v17.16b		//keep the constant only if carry is set
39	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
40	eor	v20.16b,v3.16b,v16.16b		//twisted H
41	st1	{v20.2d},[x0],#16		//store Htable[0]
42
43	//calculate H^2
	//(square twisted H: three pmulls via Karatsuba, then a two-phase
	// reduction using the 0xc2.0 constant in v19)
44	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
45	pmull	v0.1q,v20.1d,v20.1d
46	eor	v16.16b,v16.16b,v20.16b		//v16 = H.lo^H.hi in both halves
47	pmull2	v2.1q,v20.2d,v20.2d
48	pmull	v1.1q,v16.1d,v16.1d
49
50	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
51	eor	v18.16b,v0.16b,v2.16b
52	eor	v1.16b,v1.16b,v17.16b
53	eor	v1.16b,v1.16b,v18.16b
54	pmull	v18.1q,v0.1d,v19.1d		//1st phase
55
56	ins	v2.d[0],v1.d[1]
57	ins	v1.d[1],v0.d[0]
58	eor	v0.16b,v1.16b,v18.16b
59
60	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
61	pmull	v0.1q,v0.1d,v19.1d
62	eor	v18.16b,v18.16b,v2.16b
63	eor	v22.16b,v0.16b,v18.16b		//v22 = H^2
64
65	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
66	eor	v17.16b,v17.16b,v22.16b		//v17 = H^2.lo^H^2.hi in both halves
67	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
68	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
69	//calculate H^3 and H^4
	//(two interleaved multiply+reduce chains: H*H^2 in v0/v1/v2 and
	// H^2*H^2 in v5/v6/v7)
70	pmull	v0.1q,v20.1d, v22.1d
71	pmull	v5.1q,v22.1d,v22.1d
72	pmull2	v2.1q,v20.2d, v22.2d
73	pmull2	v7.1q,v22.2d,v22.2d
74	pmull	v1.1q,v16.1d,v17.1d
75	pmull	v6.1q,v17.1d,v17.1d
76
77	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
78	ext	v17.16b,v5.16b,v7.16b,#8
79	eor	v18.16b,v0.16b,v2.16b
80	eor	v1.16b,v1.16b,v16.16b
81	eor	v4.16b,v5.16b,v7.16b
82	eor	v6.16b,v6.16b,v17.16b
83	eor	v1.16b,v1.16b,v18.16b
84	pmull	v18.1q,v0.1d,v19.1d		//1st phase
85	eor	v6.16b,v6.16b,v4.16b
86	pmull	v4.1q,v5.1d,v19.1d
87
88	ins	v2.d[0],v1.d[1]
89	ins	v7.d[0],v6.d[1]
90	ins	v1.d[1],v0.d[0]
91	ins	v6.d[1],v5.d[0]
92	eor	v0.16b,v1.16b,v18.16b
93	eor	v5.16b,v6.16b,v4.16b
94
95	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
96	ext	v4.16b,v5.16b,v5.16b,#8
97	pmull	v0.1q,v0.1d,v19.1d
98	pmull	v5.1q,v5.1d,v19.1d
99	eor	v18.16b,v18.16b,v2.16b
100	eor	v4.16b,v4.16b,v7.16b
101	eor	v20.16b, v0.16b,v18.16b		//H^3
102	eor	v22.16b,v5.16b,v4.16b		//H^4
103
104	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
105	ext	v17.16b,v22.16b,v22.16b,#8
106	eor	v16.16b,v16.16b,v20.16b
107	eor	v17.16b,v17.16b,v22.16b
108	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
109	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
110	ret
111
// ---------------------------------------------------------------------------
// _gcm_gmult_v8: multiply the GHASH accumulator Xi by H, in place.
//
// In:   x0 = Xi     (16 bytes; read, then overwritten with the result)
//       x1 = Htable (only the first 32 bytes are read: twisted H and the
//                    packed Karatsuba value that follows it)
// Out:  [x0] = reduced product of Xi and H
// Uses: v0-v3, v17-v21. x0 and x1 are left unmodified. Leaf routine; the
//       stack is not touched.
// Unless __ARMEB__ is defined, Xi is byte-reversed within each 64-bit lane
// after loading and again before storing.
// ---------------------------------------------------------------------------
112.globl	_gcm_gmult_v8
113.private_extern	_gcm_gmult_v8
114
115.align	4
116_gcm_gmult_v8:
117	AARCH64_VALID_CALL_TARGET
118	ld1	{v17.2d},[x0]		//load Xi
119	movi	v19.16b,#0xe1
120	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
121	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
122#ifndef __ARMEB__
123	rev64	v17.16b,v17.16b
124#endif
125	ext	v3.16b,v17.16b,v17.16b,#8		//swap the 64-bit halves of Xi
126
127	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
128	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
129	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
130	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
131
132	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
133	eor	v18.16b,v0.16b,v2.16b
134	eor	v1.16b,v1.16b,v17.16b
135	eor	v1.16b,v1.16b,v18.16b
136	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
137
138	ins	v2.d[0],v1.d[1]
139	ins	v1.d[1],v0.d[0]
140	eor	v0.16b,v1.16b,v18.16b
141
142	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
143	pmull	v0.1q,v0.1d,v19.1d
144	eor	v18.16b,v18.16b,v2.16b
145	eor	v0.16b,v0.16b,v18.16b
146
147#ifndef __ARMEB__
148	rev64	v0.16b,v0.16b
149#endif
150	ext	v0.16b,v0.16b,v0.16b,#8		//undo the half swap before storing
151	st1	{v0.2d},[x0]		//write out Xi
152
153	ret
154
// ---------------------------------------------------------------------------
// _gcm_ghash_v8: fold 16-byte input blocks into the GHASH accumulator Xi.
//
// In:   x0 = Xi     (16 bytes; read, then overwritten with the result)
//       x1 = Htable (this path reads 48 bytes: twisted H, the packed
//                    Karatsuba value, and H^2)
//       x2 = input pointer
//       x3 = input length in bytes
// Out:  [x0] = updated Xi
// Uses: x12, v0-v7, v16-v22. x1, x2 and x3 are modified. Leaf routine; the
//       stack is not touched.
//
// Lengths of 64 bytes or more branch to Lgcm_ghash_v8_4x. Shorter inputs
// run a modulo-scheduled loop that consumes two blocks per iteration
// (Loop_mod2x_v8), followed by a single-block tail (Lodd_tail_v8).
//
// NOTE(review): the first block is loaded unconditionally and the tail only
// distinguishes "zero bytes left" from "one more block", so this presumably
// requires a non-zero length that is a multiple of 16 - confirm with callers.
// ---------------------------------------------------------------------------
155.globl	_gcm_ghash_v8
156.private_extern	_gcm_ghash_v8
157
158.align	4
159_gcm_ghash_v8:
160	AARCH64_VALID_CALL_TARGET
161	cmp	x3,#64		//64 bytes or more?
162	b.hs	Lgcm_ghash_v8_4x		//yes: use the 4-block code path
163	ld1	{v0.2d},[x0]		//load [rotated] Xi
164						//"[rotated]" means that
165						//loaded value would have
166						//to be rotated in order to
167						//make it appear as in
168						//algorithm specification
169	subs	x3,x3,#32		//see if x3 is 32 or larger
170	mov	x12,#16		//x12 is used as post-
171						//increment for input pointer;
172						//as loop is modulo-scheduled
173						//x12 is zeroed just in time
174						//to preclude overstepping
175						//inp[len], which means that
176						//last block[s] are actually
177						//loaded twice, but last
178						//copy is not processed
179	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
180	movi	v19.16b,#0xe1
181	ld1	{v22.2d},[x1]
	//the flags tested by the csel and b.lo below are still those of the
	//"subs x3,x3,#32" above; nothing in between writes the flags
182	csel	x12,xzr,x12,eq			//is it time to zero x12?
183	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
184	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
185	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
186#ifndef __ARMEB__
187	rev64	v16.16b,v16.16b
188	rev64	v0.16b,v0.16b
189#endif
190	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
191	b.lo	Lodd_tail_v8		//x3 was less than 32
192	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
193#ifndef __ARMEB__
194	rev64	v17.16b,v17.16b
195#endif
196	ext	v7.16b,v17.16b,v17.16b,#8
197	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
198	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
199	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
200	pmull2	v6.1q,v20.2d,v7.2d
201	b	Loop_mod2x_v8
202
203.align	4
	//Per iteration: (Xi^I[i])·H^2 is added to I[i+1]·H (whose lo/hi parts
	//are already in v4/v6) and the sum is reduced, while the next two
	//blocks are loaded and the following H·I[i+3] product is started.
204Loop_mod2x_v8:
205	ext	v18.16b,v3.16b,v3.16b,#8
206	subs	x3,x3,#32		//is there more data?
207	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
208	csel	x12,xzr,x12,lo			//is it time to zero x12?
209
210	pmull	v5.1q,v21.1d,v17.1d
211	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
212	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
213	eor	v0.16b,v0.16b,v4.16b		//accumulate
214	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
215	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
216
217	eor	v2.16b,v2.16b,v6.16b
218	csel	x12,xzr,x12,eq			//is it time to zero x12?
219	eor	v1.16b,v1.16b,v5.16b
220
221	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
222	eor	v18.16b,v0.16b,v2.16b
223	eor	v1.16b,v1.16b,v17.16b
224	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
225#ifndef __ARMEB__
226	rev64	v16.16b,v16.16b
227#endif
228	eor	v1.16b,v1.16b,v18.16b
229	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
230
231#ifndef __ARMEB__
232	rev64	v17.16b,v17.16b
233#endif
234	ins	v2.d[0],v1.d[1]
235	ins	v1.d[1],v0.d[0]
236	ext	v7.16b,v17.16b,v17.16b,#8
237	ext	v3.16b,v16.16b,v16.16b,#8
238	eor	v0.16b,v1.16b,v18.16b
239	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
240	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
241
242	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
243	pmull	v0.1q,v0.1d,v19.1d
244	eor	v3.16b,v3.16b,v18.16b
245	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
246	eor	v3.16b,v3.16b,v0.16b
247	pmull2	v6.1q,v20.2d,v7.2d
	//b.hs still tests the flags of the "subs x3,x3,#32" at the loop top
248	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
249
	//Loop exit: the loop body already mixed Xi into v3 for an iteration
	//that will not run, so rebuild v0 (Xi), v3 (rotated block) and x3.
250	eor	v2.16b,v2.16b,v18.16b
251	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
252	adds	x3,x3,#32		//re-construct x3
253	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
254	b.eq	Ldone_v8		//is x3 zero?
	//Single remaining block: Xi = (Xi^inp)·H, reduced.
255Lodd_tail_v8:
256	ext	v18.16b,v0.16b,v0.16b,#8
257	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
258	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
259
260	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
261	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
262	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
263	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
264
265	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
266	eor	v18.16b,v0.16b,v2.16b
267	eor	v1.16b,v1.16b,v17.16b
268	eor	v1.16b,v1.16b,v18.16b
269	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
270
271	ins	v2.d[0],v1.d[1]
272	ins	v1.d[1],v0.d[0]
273	eor	v0.16b,v1.16b,v18.16b
274
275	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
276	pmull	v0.1q,v0.1d,v19.1d
277	eor	v18.16b,v18.16b,v2.16b
278	eor	v0.16b,v0.16b,v18.16b
279
	//Common exit: undo the byte reversal and rotation, then store Xi.
280Ldone_v8:
281#ifndef __ARMEB__
282	rev64	v0.16b,v0.16b
283#endif
284	ext	v0.16b,v0.16b,v0.16b,#8
285	st1	{v0.2d},[x0]		//write out Xi
286
287	ret
288
289
// ---------------------------------------------------------------------------
// gcm_ghash_v8_4x: GHASH code path that consumes four blocks per iteration.
//
// Reached by a branch from _gcm_ghash_v8 when x3 >= 64. The label is not
// exported with .globl and has no AARCH64_VALID_CALL_TARGET marker.
//
// In:   x0 = Xi     (16 bytes; read, then overwritten with the result)
//       x1 = Htable (96 bytes are read: H, H^2, H^3, H^4 and the two packed
//                    Karatsuba entries)
//       x2 = input pointer
//       x3 = input length in bytes (>= 64 when entered from _gcm_ghash_v8)
// Out:  [x0] = updated Xi
//
// Register roles:
//   v19           0xc2.0 reduction constant
//   v20/v22       twisted H / H^2;  v21 = their packed Karatsuba values
//   v26/v28       H^3 / H^4;        v27 = their packed Karatsuba values
//   v4-v7         the four input blocks currently in flight
//   v29/v30/v31   lo/mid/hi partial products summed over the trailing blocks
//   v0/v1/v2      lo/mid/hi of the product being reduced (v0 also holds Xi)
//   x3            byte counter, biased (see the subs/adds below)
// Uses only v0-v7 and v16-v31, so the AAPCS64 callee-saved v8-v15 are never
// touched. The stack is not touched either.
//
// NOTE(review): the tail dispatch assumes 0, 16, 32 or 48 bytes remain, i.e.
// presumably the length is a multiple of 16 - confirm with callers.
// ---------------------------------------------------------------------------
290.align	4
291gcm_ghash_v8_4x:
292Lgcm_ghash_v8_4x:
293	ld1	{v0.2d},[x0]		//load [rotated] Xi
294	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
295	movi	v19.16b,#0xe1
296	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
297	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
298
299	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load the first four blocks
300#ifndef __ARMEB__
301	rev64	v0.16b,v0.16b
302	rev64	v5.16b,v5.16b
303	rev64	v6.16b,v6.16b
304	rev64	v7.16b,v7.16b
305	rev64	v4.16b,v4.16b
306#endif
307	ext	v25.16b,v7.16b,v7.16b,#8
308	ext	v24.16b,v6.16b,v6.16b,#8
309	ext	v23.16b,v5.16b,v5.16b,#8
310
	//Start the products for the three trailing blocks; the leading block
	//(v4) is combined with Xi and multiplied by H^4 in Loop4x/Ltail4x.
311	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
312	eor	v7.16b,v7.16b,v25.16b
313	pmull2	v31.1q,v20.2d,v25.2d
314	pmull	v30.1q,v21.1d,v7.1d
315
316	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
317	eor	v6.16b,v6.16b,v24.16b
318	pmull2	v24.1q,v22.2d,v24.2d
319	pmull2	v6.1q,v21.2d,v6.2d
320
321	eor	v29.16b,v29.16b,v16.16b
322	eor	v31.16b,v31.16b,v24.16b
323	eor	v30.16b,v30.16b,v6.16b
324
325	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
326	eor	v5.16b,v5.16b,v23.16b
327	pmull2	v23.1q,v26.2d,v23.2d
328	pmull	v5.1q,v27.1d,v5.1d
329
330	eor	v29.16b,v29.16b,v7.16b
331	eor	v31.16b,v31.16b,v23.16b
332	eor	v30.16b,v30.16b,v5.16b
333
	//Bias x3 by -128: it stays non-negative only while at least 64 more
	//bytes follow the four blocks already loaded.
334	subs	x3,x3,#128
335	b.lo	Ltail4x
336
337	b	Loop4x
338
339.align	4
	//Per iteration: finish and reduce the product for the four blocks in
	//flight while loading the next four and starting their H, H^2 and H^3
	//products.
340Loop4x:
341	eor	v16.16b,v4.16b,v0.16b		//Xi^I[i]
342	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64	//load the next four blocks
343	ext	v3.16b,v16.16b,v16.16b,#8
344#ifndef __ARMEB__
345	rev64	v5.16b,v5.16b
346	rev64	v6.16b,v6.16b
347	rev64	v7.16b,v7.16b
348	rev64	v4.16b,v4.16b
349#endif
350
351	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
352	eor	v16.16b,v16.16b,v3.16b
353	pmull2	v2.1q,v28.2d,v3.2d
354	ext	v25.16b,v7.16b,v7.16b,#8
355	pmull2	v1.1q,v27.2d,v16.2d
356
357	eor	v0.16b,v0.16b,v29.16b		//add the three trailing products
358	eor	v2.16b,v2.16b,v31.16b
359	ext	v24.16b,v6.16b,v6.16b,#8
360	eor	v1.16b,v1.16b,v30.16b
361	ext	v23.16b,v5.16b,v5.16b,#8
362
363	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
364	eor	v18.16b,v0.16b,v2.16b
365	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
366	eor	v7.16b,v7.16b,v25.16b
367	eor	v1.16b,v1.16b,v17.16b
368	pmull2	v31.1q,v20.2d,v25.2d
369	eor	v1.16b,v1.16b,v18.16b
370	pmull	v30.1q,v21.1d,v7.1d
371
372	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
373	ins	v2.d[0],v1.d[1]
374	ins	v1.d[1],v0.d[0]
375	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
376	eor	v6.16b,v6.16b,v24.16b
377	pmull2	v24.1q,v22.2d,v24.2d
378	eor	v0.16b,v1.16b,v18.16b
379	pmull2	v6.1q,v21.2d,v6.2d
380
381	eor	v29.16b,v29.16b,v16.16b
382	eor	v31.16b,v31.16b,v24.16b
383	eor	v30.16b,v30.16b,v6.16b
384
385	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
386	pmull	v0.1q,v0.1d,v19.1d
387	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
388	eor	v5.16b,v5.16b,v23.16b
389	eor	v18.16b,v18.16b,v2.16b
390	pmull2	v23.1q,v26.2d,v23.2d
391	pmull	v5.1q,v27.1d,v5.1d
392
393	eor	v0.16b,v0.16b,v18.16b
394	eor	v29.16b,v29.16b,v7.16b
395	eor	v31.16b,v31.16b,v23.16b
396	ext	v0.16b,v0.16b,v0.16b,#8
397	eor	v30.16b,v30.16b,v5.16b
398
399	subs	x3,x3,#64
400	b.hs	Loop4x		//at least 64 more bytes to load
401
	//Last group of four loaded blocks: multiply but do not reduce yet.
402Ltail4x:
403	eor	v16.16b,v4.16b,v0.16b
404	ext	v3.16b,v16.16b,v16.16b,#8
405
406	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
407	eor	v16.16b,v16.16b,v3.16b
408	pmull2	v2.1q,v28.2d,v3.2d
409	pmull2	v1.1q,v27.2d,v16.2d
410
411	eor	v0.16b,v0.16b,v29.16b
412	eor	v2.16b,v2.16b,v31.16b
413	eor	v1.16b,v1.16b,v30.16b
414
415	adds	x3,x3,#64		//x3 = bytes not yet loaded
416	b.eq	Ldone4x		//none: reduce and store
417
418	cmp	x3,#32
419	b.lo	Lone		//fewer than 32 bytes left
420	b.eq	Ltwo		//exactly 32 bytes left
	//More than 32 bytes left: reduce the pending product, then fold in
	//the last three blocks using H^3, H^2 and H.
421Lthree:
422	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
423	eor	v18.16b,v0.16b,v2.16b
424	eor	v1.16b,v1.16b,v17.16b
425	ld1	{v4.2d,v5.2d,v6.2d},[x2]
426	eor	v1.16b,v1.16b,v18.16b
427#ifndef	__ARMEB__
428	rev64	v5.16b,v5.16b
429	rev64	v6.16b,v6.16b
430	rev64	v4.16b,v4.16b
431#endif
432
433	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
434	ins	v2.d[0],v1.d[1]
435	ins	v1.d[1],v0.d[0]
436	ext	v24.16b,v6.16b,v6.16b,#8
437	ext	v23.16b,v5.16b,v5.16b,#8
438	eor	v0.16b,v1.16b,v18.16b
439
440	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
441	eor	v6.16b,v6.16b,v24.16b
442
443	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
444	pmull	v0.1q,v0.1d,v19.1d
445	eor	v18.16b,v18.16b,v2.16b
446	pmull2	v31.1q,v20.2d,v24.2d
447	pmull	v30.1q,v21.1d,v6.1d
448	eor	v0.16b,v0.16b,v18.16b
449	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
450	eor	v5.16b,v5.16b,v23.16b
451	ext	v0.16b,v0.16b,v0.16b,#8
452
453	pmull2	v23.1q,v22.2d,v23.2d
454	eor	v16.16b,v4.16b,v0.16b
455	pmull2	v5.1q,v21.2d,v5.2d
456	ext	v3.16b,v16.16b,v16.16b,#8
457
458	eor	v29.16b,v29.16b,v7.16b
459	eor	v31.16b,v31.16b,v23.16b
460	eor	v30.16b,v30.16b,v5.16b
461
462	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
463	eor	v16.16b,v16.16b,v3.16b
464	pmull2	v2.1q,v26.2d,v3.2d
465	pmull	v1.1q,v27.1d,v16.1d
466
467	eor	v0.16b,v0.16b,v29.16b
468	eor	v2.16b,v2.16b,v31.16b
469	eor	v1.16b,v1.16b,v30.16b
470	b	Ldone4x
471
472.align	4
	//Exactly 32 bytes left: reduce the pending product, then fold in the
	//last two blocks using H^2 and H.
473Ltwo:
474	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
475	eor	v18.16b,v0.16b,v2.16b
476	eor	v1.16b,v1.16b,v17.16b
477	ld1	{v4.2d,v5.2d},[x2]
478	eor	v1.16b,v1.16b,v18.16b
479#ifndef	__ARMEB__
480	rev64	v5.16b,v5.16b
481	rev64	v4.16b,v4.16b
482#endif
483
484	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
485	ins	v2.d[0],v1.d[1]
486	ins	v1.d[1],v0.d[0]
487	ext	v23.16b,v5.16b,v5.16b,#8
488	eor	v0.16b,v1.16b,v18.16b
489
490	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
491	pmull	v0.1q,v0.1d,v19.1d
492	eor	v18.16b,v18.16b,v2.16b
493	eor	v0.16b,v0.16b,v18.16b
494	ext	v0.16b,v0.16b,v0.16b,#8
495
496	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
497	eor	v5.16b,v5.16b,v23.16b
498
499	eor	v16.16b,v4.16b,v0.16b
500	ext	v3.16b,v16.16b,v16.16b,#8
501
502	pmull2	v31.1q,v20.2d,v23.2d
503	pmull	v30.1q,v21.1d,v5.1d
504
505	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
506	eor	v16.16b,v16.16b,v3.16b
507	pmull2	v2.1q,v22.2d,v3.2d
508	pmull2	v1.1q,v21.2d,v16.2d
509
510	eor	v0.16b,v0.16b,v29.16b
511	eor	v2.16b,v2.16b,v31.16b
512	eor	v1.16b,v1.16b,v30.16b
513	b	Ldone4x
514
515.align	4
	//Fewer than 32 bytes left: reduce the pending product, then fold in
	//one last block using H.
516Lone:
517	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
518	eor	v18.16b,v0.16b,v2.16b
519	eor	v1.16b,v1.16b,v17.16b
520	ld1	{v4.2d},[x2]
521	eor	v1.16b,v1.16b,v18.16b
522#ifndef	__ARMEB__
523	rev64	v4.16b,v4.16b
524#endif
525
526	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
527	ins	v2.d[0],v1.d[1]
528	ins	v1.d[1],v0.d[0]
529	eor	v0.16b,v1.16b,v18.16b
530
531	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
532	pmull	v0.1q,v0.1d,v19.1d
533	eor	v18.16b,v18.16b,v2.16b
534	eor	v0.16b,v0.16b,v18.16b
535	ext	v0.16b,v0.16b,v0.16b,#8
536
537	eor	v16.16b,v4.16b,v0.16b
538	ext	v3.16b,v16.16b,v16.16b,#8
539
540	pmull	v0.1q,v20.1d,v3.1d		//H·(Xi+Ii)
541	eor	v16.16b,v16.16b,v3.16b
542	pmull2	v2.1q,v20.2d,v3.2d
543	pmull	v1.1q,v21.1d,v16.1d
544
	//Final reduction of the product in v0/v1/v2, then store Xi.
545Ldone4x:
546	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
547	eor	v18.16b,v0.16b,v2.16b
548	eor	v1.16b,v1.16b,v17.16b
549	eor	v1.16b,v1.16b,v18.16b
550
551	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
552	ins	v2.d[0],v1.d[1]
553	ins	v1.d[1],v0.d[0]
554	eor	v0.16b,v1.16b,v18.16b
555
556	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
557	pmull	v0.1q,v0.1d,v19.1d
558	eor	v18.16b,v18.16b,v2.16b
559	eor	v0.16b,v0.16b,v18.16b
560	ext	v0.16b,v0.16b,v0.16b,#8
561
562#ifndef __ARMEB__
563	rev64	v0.16b,v0.16b
564#endif
565	st1	{v0.2d},[x0]		//write out Xi
566
567	ret
568
// NUL-terminated ASCII identification string embedded in the object:
//   "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
569.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align after the odd-length string above (the directive is emitted twice).
570.align	2
571.align	2
572#endif
573#endif  // !OPENSSL_NO_ASM
574