#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9    3xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%       3.33            1.70
# Cortex-A53		8.40/+80%       4.72		4.72(*)
# Cortex-A57		8.06/+43%       4.90            4.43(**)
# Denver		4.50/+82%       2.63		2.67(*)
# X-Gene		9.50/+46%       8.82		8.89(*)
# Mongoose		8.00/+44%	3.64		3.25
# Kryo			8.17/+50%	4.83		4.65
#
# (*)	doubling the interleave factor is not expected to help every
#	processor, only those with higher NEON latency and higher
#	instruction issue rate;
# (**)	the expected improvement was actually higher;

$flavour=shift;
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
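# For instance, &add_32(@x[0],@x[0],@x[1]) appends "add.32\tx5,x5,x6" to
# $code, and &ror_32(@x[3],@x[3],16) appends "ror.32\tx8,x8,#16"; the
# output filter at the bottom of this file later rewrites such .32 forms
# to operate on the w registers.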

my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));
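# @x holds the 16 32-bit words of the working state in x5-x17 and
# x19-x21 (x18, the platform register, is skipped); @d holds the
# original 64-byte key block packed two 32-bit words per register.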

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
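# The mapping above rotates each index within its group of four, so
# ROUND(0,4,8,12) also drives columns (1,5,9,13), (2,6,10,14) and
# (3,7,11,15), while ROUND(0,5,10,15) covers the remaining diagonals.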

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	 "&ror_32	(@x[$d1],@x[$d1],16)",
	  "&ror_32	(@x[$d2],@x[$d2],16)",
	   "&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	 "&ror_32	(@x[$b1],@x[$b1],20)",
	  "&ror_32	(@x[$b2],@x[$b2],20)",
	   "&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	 "&ror_32	(@x[$d1],@x[$d1],24)",
	  "&ror_32	(@x[$d2],@x[$d2],24)",
	   "&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	 "&ror_32	(@x[$b1],@x[$b1],25)",
	  "&ror_32	(@x[$b2],@x[$b2],25)",
	   "&ror_32	(@x[$b3],@x[$b3],25)"
    );
}
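# The generated instructions implement the standard ChaCha quarter-round,
# four columns (or diagonals) at a time; "ror" by 16/20/24/25 equals a
# rotate-left by 16/12/8/7. As a minimal pure-Perl sketch of one
# quarter-round (illustrative only, never called by this generator):
sub quarter_round_ref {
my ($a,$b,$c,$d)=@_;
    $a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<16)|($d>>16))&0xffffffff;
    $c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<12)|($b>>20))&0xffffffff;
    $a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<8)|($d>>24))&0xffffffff;
    $c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<7)|($b>>25))&0xffffffff;
    ($a,$b,$c,$d);
}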

$code.=<<___;
#include "arm_arch.h"

.text

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
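// Self-relative offset to OPENSSL_armcap_P; ChaCha20_ctr32 below adds
// this offset to the address of the label itself to locate the
// capability word, keeping the reference position-independent.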
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
#ifdef	__ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
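	# One column round followed by one diagonal round, i.e. one
	# "double round" per .Loop iteration; $ctr counts down from 10,
	# for the full 20 rounds of ChaCha20 on the block held in @x.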
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len
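// Tail is handled byte by byte: input, keystream (on the stack) and
// output pointers are all biased by the remaining length and the length
// negated, so one counter register indexes all three; the output
// pointer is pre-decremented by 1 because the counter is incremented
// before the byte is written back.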

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";
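# Vector register layout: @K[0] holds sigma, @K[1..2] the key and
# @K[3..5] consecutive counter blocks (@K[6] serves as a fourth one in
# the 512-byte path); $ONE starts out as {1,0,0,0} and is later shifted
# to {4,0,0,0} for counter increments.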

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$t','$d','$a')",
	"&ushr		('$d','$t',24)",
	"&sli		('$d','$t',8)",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}
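# Each ushr/sli pair implements a rotate-left (by 12, 8 and 7), while
# rev32 on .8h data doubles as the 16-bit rotate of the first step.
# The trailing ext instructions rotate the b/c/d rows by 1/2/3 lanes in
# even rounds, lining the columns up as diagonals, and by 3/2/1 lanes
# in odd rounds to undo it.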

$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	 mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	 mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	 mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	 mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	 mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	 mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	 mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	 mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
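	# Three 64-byte blocks are processed in NEON registers while a
	# fourth runs in the general-purpose registers (@thread3), one
	# instruction from each stream at a time; hence 256 bytes per
	# .Loop_outer_neon iteration.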
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	 add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	 add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	 add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	 add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	 add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	 add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	 add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	 add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	 add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	 add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	 add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
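# The 512-byte path runs six NEON blocks alongside scalar code that, at
# two double rounds per loop iteration, completes one block during
# .Loop_upper_neon and another during .Loop_lower_neon: eight blocks,
# 512 bytes, per outer iteration.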

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]
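// AAPCS64 only requires the low 64 bits of v8-v15 to survive calls,
// hence saving d8-d15 rather than the full q registers.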

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	 mov	$A0,@K[0]
	 mov	$A1,@K[0]
	 mov	$A2,@K[0]
	 mov	$A3,@K[0]
	 mov	$A4,@K[0]
	 mov	$A5,@K[0]
	 mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	 mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	 mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	 mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	 mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	 mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	 mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	 mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	 mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	 mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	 mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	 mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	 add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	 add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	 mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	 mov	$C4,@K[2]
	 stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	 mov	$C5,@K[2]
	 str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	 stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 stp	@x[12],@x[14],[$out,#48]
	 add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	 ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	 ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	 add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	 add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	 add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	 add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	 add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	 add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	 add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	 add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	 add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	 add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	 add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	 add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	 add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	 add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	 add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	 add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	 add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	 add	$B5,$B5,@K[1]

#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	 ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	 ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
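	# e.g. "add.32 x5,x5,x6" becomes "add w5,w5,w6", "mov v0.4s,v24.4s"
	# becomes "mov v0.16b,v24.16b", and ldp/stp of vector registers are
	# rewritten to their q-register aliases.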

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# flush
