#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9    3xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%       3.33            1.70
# Cortex-A53		8.40/+80%       4.72		4.72(*)
# Cortex-A57		8.06/+43%       4.90            4.43(**)
# Denver		4.50/+82%       2.63		2.67(*)
# X-Gene		9.50/+46%       8.82		8.89(*)
# Mongoose		8.00/+44%	3.64		3.25
# Kryo			8.17/+50%	4.83		4.65
#
# (*)	doubling the interleave factor is not expected to help all
#	processors, only those with higher NEON latency and higher
#	instruction issue rate;
# (**)	the expected improvement was actually higher;

$flavour=shift;
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
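
# A call such as &ror_32("x5","x5",16), for example, is caught by the
# AUTOLOAD thunk above and emitted as "\tror.32\tx5,x5,#16"; the
# post-processing loop at the bottom of this file then rewrites that
# ".32" form into the 32-bit instruction "ror	w5,w5,#16".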

my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));

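# (x18 is skipped above: it is reserved as the platform register by the
# AAPCS64 procedure call standard on some targets.)

# ROUND() returns one ChaCha round as a list of instruction strings:
# the same quarter-round is applied to four columns (or diagonals,
# depending on the indices passed in) of the 4x4 state, with the four
# instruction streams interleaved to hide latency.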
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
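# The map above rotates each index within its row of the 4x4 state:
# starting from the columns (0,4,8,12) it yields the other three column
# quartets, and starting from the diagonal (0,5,10,15) the remaining
# diagonals.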

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	 "&ror_32	(@x[$d1],@x[$d1],16)",
	  "&ror_32	(@x[$d2],@x[$d2],16)",
	   "&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	 "&ror_32	(@x[$b1],@x[$b1],20)",
	  "&ror_32	(@x[$b2],@x[$b2],20)",
	   "&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	 "&ror_32	(@x[$d1],@x[$d1],24)",
	  "&ror_32	(@x[$d2],@x[$d2],24)",
	   "&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	 "&ror_32	(@x[$b1],@x[$b1],25)",
	  "&ror_32	(@x[$b2],@x[$b2],25)",
	   "&ror_32	(@x[$b3],@x[$b3],25)"
    );
}

$code.=<<___;
#include <GFp/arm_arch.h>

.extern	GFp_armcap_P
.hidden	GFp_armcap_P

.section .rodata

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.text

.globl	GFp_ChaCha20_ctr32
.type	GFp_ChaCha20_ctr32,%function
.align	5
GFp_ChaCha20_ctr32:
	AARCH64_VALID_CALL_TARGET
	cbz	$len,.Labort
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
	adrp	@x[0],:pg_hi21_nc:GFp_armcap_P
#else
	adrp	@x[0],:pg_hi21:GFp_armcap_P
#endif
	cmp	$len,#192
	b.lo	.Lshort
	ldr	w17,[@x[0],:lo12:GFp_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],:pg_hi21:.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
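	# Ten iterations of the loop body; each emits one column round
	# followed by one diagonal round, i.e. 20 ChaCha rounds in total.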
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	GFp_ChaCha20_ctr32,.-GFp_ChaCha20_ctr32
___

{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";

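# NEONROUND() is the vector analogue of ROUND(): one quarter-round on a
# full 4x4 state held in four vector registers. Rotations are done as
# ushr+sli pairs (rev32 on halfwords for the rotate-by-16 case), and
# the trailing ext instructions rotate the lanes of $b, $c and $d so
# that the next call operates on diagonals; $odd selects between the
# column->diagonal and diagonal->column lane permutations.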
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$t','$d','$a')",
	"&ushr		('$d','$t',24)",
	"&sli		('$d','$t',8)",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}

$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],:pg_hi21:.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	 mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	 mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	 mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	 mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	 mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	 mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	 mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	 mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
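	# 3xNEON+1xIALU: three 64-byte blocks are processed in the vector
	# units while a fourth goes through the scalar pipeline, with the
	# four instruction streams interleaved below; each iteration of
	# .Loop_outer_neon therefore covers 4 blocks, i.e. 256 bytes.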
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	 add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	 add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	 add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	 add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	 add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	 add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	 add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	 add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	 add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	 add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	 add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	@x[0],:pg_hi21:.Lsigma
	add	@x[0],@x[0],:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	 mov	$A0,@K[0]
	 mov	$A1,@K[0]
	 mov	$A2,@K[0]
	 mov	$A3,@K[0]
	 mov	$A4,@K[0]
	 mov	$A5,@K[0]
	 mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	 mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	 mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	 mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	 mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	 mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	 mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	 mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	 mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	 mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	 mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	 mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	 add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	 add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	 mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	 mov	$C4,@K[2]
	 stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	 mov	$C5,@K[2]
	 str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
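	# 6xNEON+2xIALU: six vector blocks are interleaved with scalar
	# code that runs at twice their round rate, so the scalar pipeline
	# finishes one block during .Loop_upper_neon and a second one
	# during .Loop_lower_neon while the six vector blocks span both
	# loops; each outer iteration covers 8 blocks, i.e. 512 bytes.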
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	 stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 stp	@x[12],@x[14],[$out,#48]
	 add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	 ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	 ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	 add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	 add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	 add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	 add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	 add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	 add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	 add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	 add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	 add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	 add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	 add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	 add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	 add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	 add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	 add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	 add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	 add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	 add	$B5,$B5,@K[1]

#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	 ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	 ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}

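# Translate the pseudo syntax used above into real AArch64 assembly:
# an "op.32" on x-registers becomes the 32-bit form on w-registers;
# eor/ext/mov on .4s vectors are rewritten as bitwise .16b forms;
# "ld1.8"/"st1.8" become plain ld1/st1 on .16b; ldr/ldp/str/stp of
# v<n>.4s use the q<n> register names; and "rev32.16" becomes rev32
# on .8h halfwords.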
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT";	# flush
