1#! /usr/bin/env perl
2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# December 2014
18#
19# ChaCha20 for ARMv4.
20#
21# Performance in cycles per byte out of large buffer.
22#
23#			IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
24#
25# Cortex-A5		19.3(*)/+95%    21.8        14.1
26# Cortex-A8		10.5(*)/+160%   13.9        6.35
27# Cortex-A9		12.9(**)/+110%  14.3        6.50
28# Cortex-A15		11.0/+40%       16.0        5.00
29# Snapdragon S4		11.5/+125%      13.6        4.90
30#
31# (*)	most "favourable" result for aligned data on little-endian
32#	processor, result for misaligned data is 10-15% lower;
33# (**)	this result is a trade-off: it can be improved by 20%,
34#	but then Snapdragon S4 and Cortex-A8 results get
35#	20-25% worse;
36
37$flavour = shift;
38if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
39else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
40
41if ($flavour && $flavour ne "void") {
42    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
44    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
45    die "can't locate arm-xlate.pl";
46
47    open STDOUT,"| \"$^X\" $xlate $flavour $output";
48} else {
49    open STDOUT,">$output";
50}
51
52sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
53{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
54  my $arg = pop;
55    $arg = "#$arg" if ($arg*1 eq $arg);
56    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
57}
58
59my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
60my @t=map("r$_",(8..11));
61
62sub ROUND {
63my ($a0,$b0,$c0,$d0)=@_;
64my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
65my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
66my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
67my $odd = $d0&1;
68my ($xc,$xc_) = (@t[0..1]);
69my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
70my @ret;
71
72	# Consider order in which variables are addressed by their
73	# index:
74	#
75	#       a   b   c   d
76	#
77	#       0   4   8  12 < even round
78	#       1   5   9  13
79	#       2   6  10  14
80	#       3   7  11  15
81	#       0   5  10  15 < odd round
82	#       1   6  11  12
83	#       2   7   8  13
84	#       3   4   9  14
85	#
86	# 'a', 'b' are permanently allocated in registers, @x[0..7],
87	# while 'c's and pair of 'd's are maintained in memory. If
88	# you observe 'c' column, you'll notice that pair of 'c's is
89	# invariant between rounds. This means that we have to reload
90	# them once per round, in the middle. This is why you'll see
91	# bunch of 'c' stores and loads in the middle, but none in
92	# the beginning or end. If you observe 'd' column, you'll
93	# notice that 15 and 13 are reused in next pair of rounds.
94	# This is why these two are chosen for offloading to memory,
95	# to make loads count more.
96							push @ret,(
97	"&add	(@x[$a0],@x[$a0],@x[$b0])",
98	"&mov	($xd,$xd,'ror#16')",
99	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
100	 "&mov	($xd_,$xd_,'ror#16')",
101	"&eor	($xd,$xd,@x[$a0],'ror#16')",
102	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",
103
104	"&add	($xc,$xc,$xd)",
105	"&mov	(@x[$b0],@x[$b0],'ror#20')",
106	 "&add	($xc_,$xc_,$xd_)",
107	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
108	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
109	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",
110
111	"&add	(@x[$a0],@x[$a0],@x[$b0])",
112	"&mov	($xd,$xd,'ror#24')",
113	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
114	 "&mov	($xd_,$xd_,'ror#24')",
115	"&eor	($xd,$xd,@x[$a0],'ror#24')",
116	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",
117
118	"&add	($xc,$xc,$xd)",
119	"&mov	(@x[$b0],@x[$b0],'ror#25')"		);
120							push @ret,(
121	"&str	($xd,'[sp,#4*(16+$d0)]')",
122	"&ldr	($xd,'[sp,#4*(16+$d2)]')"		) if ($odd);
123							push @ret,(
124	 "&add	($xc_,$xc_,$xd_)",
125	 "&mov	(@x[$b1],@x[$b1],'ror#25')"		);
126							push @ret,(
127	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
128	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')"		) if (!$odd);
129							push @ret,(
130	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
131	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')"	);
132
133	$xd=@x[$d2]					if (!$odd);
134	$xd_=@x[$d3]					if ($odd);
135							push @ret,(
136	"&str	($xc,'[sp,#4*(16+$c0)]')",
137	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
138	"&add	(@x[$a2],@x[$a2],@x[$b2])",
139	"&mov	($xd,$xd,'ror#16')",
140	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
141	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
142	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
143	 "&mov	($xd_,$xd_,'ror#16')",
144	"&eor	($xd,$xd,@x[$a2],'ror#16')",
145	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",
146
147	"&add	($xc,$xc,$xd)",
148	"&mov	(@x[$b2],@x[$b2],'ror#20')",
149	 "&add	($xc_,$xc_,$xd_)",
150	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
151	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
152	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",
153
154	"&add	(@x[$a2],@x[$a2],@x[$b2])",
155	"&mov	($xd,$xd,'ror#24')",
156	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
157	 "&mov	($xd_,$xd_,'ror#24')",
158	"&eor	($xd,$xd,@x[$a2],'ror#24')",
159	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",
160
161	"&add	($xc,$xc,$xd)",
162	"&mov	(@x[$b2],@x[$b2],'ror#25')",
163	 "&add	($xc_,$xc_,$xd_)",
164	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
165	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
166	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')"	);
167
168	@ret;
169}
170
171$code.=<<___;
172#include "arm_arch.h"
173
174.text
175#if defined(__thumb2__) || defined(__clang__)
176.syntax	unified
177#endif
178#if defined(__thumb2__)
179.thumb
180#else
181.code	32
182#endif
183
184#if defined(__thumb2__) || defined(__clang__)
185#define ldrhsb	ldrbhs
186#endif
187
188.align	5
189.Lsigma:
190.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
191.Lone:
192.long	1,0,0,0
193#if __ARM_MAX_ARCH__>=7
194.LOPENSSL_armcap:
195.word   OPENSSL_armcap_P-.LChaCha20_ctr32
196#else
197.word	-1
198#endif
199
200.globl	ChaCha20_ctr32
201.type	ChaCha20_ctr32,%function
202.align	5
203ChaCha20_ctr32:
204.LChaCha20_ctr32:
205	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
206	stmdb	sp!,{r0-r2,r4-r11,lr}
207#if __ARM_ARCH__<7 && !defined(__thumb2__)
208	sub	r14,pc,#16		@ ChaCha20_ctr32
209#else
210	adr	r14,.LChaCha20_ctr32
211#endif
212	cmp	r2,#0			@ len==0?
213#ifdef	__thumb2__
214	itt	eq
215#endif
216	addeq	sp,sp,#4*3
217	beq	.Lno_data
218#if __ARM_MAX_ARCH__>=7
219	cmp	r2,#192			@ test len
220	bls	.Lshort
221	ldr	r4,[r14,#-32]
222	ldr	r4,[r14,r4]
223# ifdef	__APPLE__
224	ldr	r4,[r4]
225# endif
226	tst	r4,#ARMV7_NEON
227	bne	.LChaCha20_neon
228.Lshort:
229#endif
230	ldmia	r12,{r4-r7}		@ load counter and nonce
231	sub	sp,sp,#4*(16)		@ off-load area
232	sub	r14,r14,#64		@ .Lsigma
233	stmdb	sp!,{r4-r7}		@ copy counter and nonce
234	ldmia	r3,{r4-r11}		@ load key
235	ldmia	r14,{r0-r3}		@ load sigma
236	stmdb	sp!,{r4-r11}		@ copy key
237	stmdb	sp!,{r0-r3}		@ copy sigma
238	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
239	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
240	b	.Loop_outer_enter
241
242.align	4
243.Loop_outer:
244	ldmia	sp,{r0-r9}		@ load key material
245	str	@t[3],[sp,#4*(32+2)]	@ save len
246	str	r12,  [sp,#4*(32+1)]	@ save inp
247	str	r14,  [sp,#4*(32+0)]	@ save out
248.Loop_outer_enter:
249	ldr	@t[3], [sp,#4*(15)]
250	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
251	ldr	@t[2], [sp,#4*(13)]
252	ldr	@x[14],[sp,#4*(14)]
253	str	@t[3], [sp,#4*(16+15)]
254	mov	@t[3],#10
255	b	.Loop
256
257.align	4
258.Loop:
259	subs	@t[3],@t[3],#1
260___
261	foreach (&ROUND(0, 4, 8,12)) { eval; }
262	foreach (&ROUND(0, 5,10,15)) { eval; }
263$code.=<<___;
264	bne	.Loop
265
266	ldr	@t[3],[sp,#4*(32+2)]	@ load len
267
268	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
269	str	@t[1], [sp,#4*(16+9)]
270	str	@x[12],[sp,#4*(16+12)]
271	str	@t[2], [sp,#4*(16+13)]
272	str	@x[14],[sp,#4*(16+14)]
273
274	@ at this point we have first half of 512-bit result in
275	@ @x[0-7] and second half at sp+4*(16+8)
276
277	cmp	@t[3],#64		@ done yet?
278#ifdef	__thumb2__
279	itete	lo
280#endif
281	addlo	r12,sp,#4*(0)		@ shortcut or ...
282	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
283	addlo	r14,sp,#4*(0)		@ shortcut or ...
284	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
285
286	ldr	@t[0],[sp,#4*(0)]	@ load key material
287	ldr	@t[1],[sp,#4*(1)]
288
289#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
290# if __ARM_ARCH__<7
291	orr	@t[2],r12,r14
292	tst	@t[2],#3		@ are input and output aligned?
293	ldr	@t[2],[sp,#4*(2)]
294	bne	.Lunaligned
295	cmp	@t[3],#64		@ restore flags
296# else
297	ldr	@t[2],[sp,#4*(2)]
298# endif
299	ldr	@t[3],[sp,#4*(3)]
300
301	add	@x[0],@x[0],@t[0]	@ accumulate key material
302	add	@x[1],@x[1],@t[1]
303# ifdef	__thumb2__
304	itt	hs
305# endif
306	ldrhs	@t[0],[r12],#16		@ load input
307	ldrhs	@t[1],[r12,#-12]
308
309	add	@x[2],@x[2],@t[2]
310	add	@x[3],@x[3],@t[3]
311# ifdef	__thumb2__
312	itt	hs
313# endif
314	ldrhs	@t[2],[r12,#-8]
315	ldrhs	@t[3],[r12,#-4]
316# if __ARM_ARCH__>=6 && defined(__ARMEB__)
317	rev	@x[0],@x[0]
318	rev	@x[1],@x[1]
319	rev	@x[2],@x[2]
320	rev	@x[3],@x[3]
321# endif
322# ifdef	__thumb2__
323	itt	hs
324# endif
325	eorhs	@x[0],@x[0],@t[0]	@ xor with input
326	eorhs	@x[1],@x[1],@t[1]
327	 add	@t[0],sp,#4*(4)
328	str	@x[0],[r14],#16		@ store output
329# ifdef	__thumb2__
330	itt	hs
331# endif
332	eorhs	@x[2],@x[2],@t[2]
333	eorhs	@x[3],@x[3],@t[3]
334	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
335	str	@x[1],[r14,#-12]
336	str	@x[2],[r14,#-8]
337	str	@x[3],[r14,#-4]
338
339	add	@x[4],@x[4],@t[0]	@ accumulate key material
340	add	@x[5],@x[5],@t[1]
341# ifdef	__thumb2__
342	itt	hs
343# endif
344	ldrhs	@t[0],[r12],#16		@ load input
345	ldrhs	@t[1],[r12,#-12]
346	add	@x[6],@x[6],@t[2]
347	add	@x[7],@x[7],@t[3]
348# ifdef	__thumb2__
349	itt	hs
350# endif
351	ldrhs	@t[2],[r12,#-8]
352	ldrhs	@t[3],[r12,#-4]
353# if __ARM_ARCH__>=6 && defined(__ARMEB__)
354	rev	@x[4],@x[4]
355	rev	@x[5],@x[5]
356	rev	@x[6],@x[6]
357	rev	@x[7],@x[7]
358# endif
359# ifdef	__thumb2__
360	itt	hs
361# endif
362	eorhs	@x[4],@x[4],@t[0]
363	eorhs	@x[5],@x[5],@t[1]
364	 add	@t[0],sp,#4*(8)
365	str	@x[4],[r14],#16		@ store output
366# ifdef	__thumb2__
367	itt	hs
368# endif
369	eorhs	@x[6],@x[6],@t[2]
370	eorhs	@x[7],@x[7],@t[3]
371	str	@x[5],[r14,#-12]
372	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
373	str	@x[6],[r14,#-8]
374	 add	@x[0],sp,#4*(16+8)
375	str	@x[7],[r14,#-4]
376
377	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half
378
379	add	@x[0],@x[0],@t[0]	@ accumulate key material
380	add	@x[1],@x[1],@t[1]
381# ifdef	__thumb2__
382	itt	hs
383# endif
384	ldrhs	@t[0],[r12],#16		@ load input
385	ldrhs	@t[1],[r12,#-12]
386# ifdef	__thumb2__
387	itt	hi
388# endif
389	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
390	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
391	add	@x[2],@x[2],@t[2]
392	add	@x[3],@x[3],@t[3]
393# ifdef	__thumb2__
394	itt	hs
395# endif
396	ldrhs	@t[2],[r12,#-8]
397	ldrhs	@t[3],[r12,#-4]
398# if __ARM_ARCH__>=6 && defined(__ARMEB__)
399	rev	@x[0],@x[0]
400	rev	@x[1],@x[1]
401	rev	@x[2],@x[2]
402	rev	@x[3],@x[3]
403# endif
404# ifdef	__thumb2__
405	itt	hs
406# endif
407	eorhs	@x[0],@x[0],@t[0]
408	eorhs	@x[1],@x[1],@t[1]
409	 add	@t[0],sp,#4*(12)
410	str	@x[0],[r14],#16		@ store output
411# ifdef	__thumb2__
412	itt	hs
413# endif
414	eorhs	@x[2],@x[2],@t[2]
415	eorhs	@x[3],@x[3],@t[3]
416	str	@x[1],[r14,#-12]
417	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
418	str	@x[2],[r14,#-8]
419	str	@x[3],[r14,#-4]
420
421	add	@x[4],@x[4],@t[0]	@ accumulate key material
422	add	@x[5],@x[5],@t[1]
423# ifdef	__thumb2__
424	itt	hi
425# endif
426	 addhi	@t[0],@t[0],#1		@ next counter value
427	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
428# ifdef	__thumb2__
429	itt	hs
430# endif
431	ldrhs	@t[0],[r12],#16		@ load input
432	ldrhs	@t[1],[r12,#-12]
433	add	@x[6],@x[6],@t[2]
434	add	@x[7],@x[7],@t[3]
435# ifdef	__thumb2__
436	itt	hs
437# endif
438	ldrhs	@t[2],[r12,#-8]
439	ldrhs	@t[3],[r12,#-4]
440# if __ARM_ARCH__>=6 && defined(__ARMEB__)
441	rev	@x[4],@x[4]
442	rev	@x[5],@x[5]
443	rev	@x[6],@x[6]
444	rev	@x[7],@x[7]
445# endif
446# ifdef	__thumb2__
447	itt	hs
448# endif
449	eorhs	@x[4],@x[4],@t[0]
450	eorhs	@x[5],@x[5],@t[1]
451# ifdef	__thumb2__
452	 it	ne
453# endif
454	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
455# ifdef	__thumb2__
456	itt	hs
457# endif
458	eorhs	@x[6],@x[6],@t[2]
459	eorhs	@x[7],@x[7],@t[3]
460	str	@x[4],[r14],#16		@ store output
461	str	@x[5],[r14,#-12]
462# ifdef	__thumb2__
463	it	hs
464# endif
465	 subhs	@t[3],@t[0],#64		@ len-=64
466	str	@x[6],[r14,#-8]
467	str	@x[7],[r14,#-4]
468	bhi	.Loop_outer
469
470	beq	.Ldone
471# if __ARM_ARCH__<7
472	b	.Ltail
473
474.align	4
475.Lunaligned:				@ unaligned endian-neutral path
476	cmp	@t[3],#64		@ restore flags
477# endif
478#endif
479#if __ARM_ARCH__<7
480	ldr	@t[3],[sp,#4*(3)]
481___
482for ($i=0;$i<16;$i+=4) {
483my $j=$i&0x7;
484
485$code.=<<___	if ($i==4);
486	add	@x[0],sp,#4*(16+8)
487___
488$code.=<<___	if ($i==8);
489	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
490# ifdef	__thumb2__
491	itt	hi
492# endif
493	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
494	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
495___
496$code.=<<___;
497	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
498___
499$code.=<<___	if ($i==12);
500# ifdef	__thumb2__
501	itt	hi
502# endif
503	addhi	@t[0],@t[0],#1			@ next counter value
504	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
505___
506$code.=<<___;
507	add	@x[$j+1],@x[$j+1],@t[1]
508	add	@x[$j+2],@x[$j+2],@t[2]
509# ifdef	__thumb2__
510	itete	lo
511# endif
512	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
513	ldrhsb	@t[0],[r12],#16			@ ... load input
514	eorlo	@t[1],@t[1],@t[1]
515	ldrhsb	@t[1],[r12,#-12]
516
517	add	@x[$j+3],@x[$j+3],@t[3]
518# ifdef	__thumb2__
519	itete	lo
520# endif
521	eorlo	@t[2],@t[2],@t[2]
522	ldrhsb	@t[2],[r12,#-8]
523	eorlo	@t[3],@t[3],@t[3]
524	ldrhsb	@t[3],[r12,#-4]
525
526	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
527	eor	@x[$j+1],@t[1],@x[$j+1]
528# ifdef	__thumb2__
529	itt	hs
530# endif
531	ldrhsb	@t[0],[r12,#-15]		@ load more input
532	ldrhsb	@t[1],[r12,#-11]
533	eor	@x[$j+2],@t[2],@x[$j+2]
534	 strb	@x[$j+0],[r14],#16		@ store output
535	eor	@x[$j+3],@t[3],@x[$j+3]
536# ifdef	__thumb2__
537	itt	hs
538# endif
539	ldrhsb	@t[2],[r12,#-7]
540	ldrhsb	@t[3],[r12,#-3]
541	 strb	@x[$j+1],[r14,#-12]
542	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
543	 strb	@x[$j+2],[r14,#-8]
544	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
545# ifdef	__thumb2__
546	itt	hs
547# endif
548	ldrhsb	@t[0],[r12,#-14]		@ load more input
549	ldrhsb	@t[1],[r12,#-10]
550	 strb	@x[$j+3],[r14,#-4]
551	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
552	 strb	@x[$j+0],[r14,#-15]
553	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
554# ifdef	__thumb2__
555	itt	hs
556# endif
557	ldrhsb	@t[2],[r12,#-6]
558	ldrhsb	@t[3],[r12,#-2]
559	 strb	@x[$j+1],[r14,#-11]
560	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
561	 strb	@x[$j+2],[r14,#-7]
562	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
563# ifdef	__thumb2__
564	itt	hs
565# endif
566	ldrhsb	@t[0],[r12,#-13]		@ load more input
567	ldrhsb	@t[1],[r12,#-9]
568	 strb	@x[$j+3],[r14,#-3]
569	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
570	 strb	@x[$j+0],[r14,#-14]
571	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
572# ifdef	__thumb2__
573	itt	hs
574# endif
575	ldrhsb	@t[2],[r12,#-5]
576	ldrhsb	@t[3],[r12,#-1]
577	 strb	@x[$j+1],[r14,#-10]
578	 strb	@x[$j+2],[r14,#-6]
579	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
580	 strb	@x[$j+3],[r14,#-2]
581	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
582	 strb	@x[$j+0],[r14,#-13]
583	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
584	 strb	@x[$j+1],[r14,#-9]
585	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
586	 strb	@x[$j+2],[r14,#-5]
587	 strb	@x[$j+3],[r14,#-1]
588___
589$code.=<<___	if ($i<12);
590	add	@t[0],sp,#4*(4+$i)
591	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
592___
593}
594$code.=<<___;
595# ifdef	__thumb2__
596	it	ne
597# endif
598	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
599# ifdef	__thumb2__
600	it	hs
601# endif
602	subhs	@t[3],@t[0],#64			@ len-=64
603	bhi	.Loop_outer
604
605	beq	.Ldone
606#endif
607
608.Ltail:
609	ldr	r12,[sp,#4*(32+1)]	@ load inp
610	add	@t[1],sp,#4*(0)
611	ldr	r14,[sp,#4*(32+0)]	@ load out
612
613.Loop_tail:
614	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
615	ldrb	@t[3],[r12],#1		@ read input
616	subs	@t[0],@t[0],#1
617	eor	@t[3],@t[3],@t[2]
618	strb	@t[3],[r14],#1		@ store output
619	bne	.Loop_tail
620
621.Ldone:
622	add	sp,sp,#4*(32+3)
623.Lno_data:
624	ldmia	sp!,{r4-r11,pc}
625.size	ChaCha20_ctr32,.-ChaCha20_ctr32
626___
627
628{{{
629my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
630    map("q$_",(0..15));
631
632sub NEONROUND {
633my $odd = pop;
634my ($a,$b,$c,$d,$t)=@_;
635
636	(
637	"&vadd_i32	($a,$a,$b)",
638	"&veor		($d,$d,$a)",
639	"&vrev32_16	($d,$d)",	# vrot ($d,16)
640
641	"&vadd_i32	($c,$c,$d)",
642	"&veor		($t,$b,$c)",
643	"&vshr_u32	($b,$t,20)",
644	"&vsli_32	($b,$t,12)",
645
646	"&vadd_i32	($a,$a,$b)",
647	"&veor		($t,$d,$a)",
648	"&vshr_u32	($d,$t,24)",
649	"&vsli_32	($d,$t,8)",
650
651	"&vadd_i32	($c,$c,$d)",
652	"&veor		($t,$b,$c)",
653	"&vshr_u32	($b,$t,25)",
654	"&vsli_32	($b,$t,7)",
655
656	"&vext_8	($c,$c,$c,8)",
657	"&vext_8	($b,$b,$b,$odd?12:4)",
658	"&vext_8	($d,$d,$d,$odd?4:12)"
659	);
660}
661
662$code.=<<___;
663#if __ARM_MAX_ARCH__>=7
664.arch	armv7-a
665.fpu	neon
666
667.type	ChaCha20_neon,%function
668.align	5
669ChaCha20_neon:
670	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
671	stmdb		sp!,{r0-r2,r4-r11,lr}
672.LChaCha20_neon:
673	adr		r14,.Lsigma
674	vstmdb		sp!,{d8-d15}		@ ABI spec says so
675	stmdb		sp!,{r0-r3}
676
677	vld1.32		{$b0-$c0},[r3]		@ load key
678	ldmia		r3,{r4-r11}		@ load key
679
680	sub		sp,sp,#4*(16+16)
681	vld1.32		{$d0},[r12]		@ load counter and nonce
682	add		r12,sp,#4*8
683	ldmia		r14,{r0-r3}		@ load sigma
684	vld1.32		{$a0},[r14]!		@ load sigma
685	vld1.32		{$t0},[r14]		@ one
686	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
687	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key
688
689	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
690	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
691	vshl.i32	$t1#lo,$t0#lo,#1	@ two
692	vstr		$t0#lo,[sp,#4*(16+0)]
693	vshl.i32	$t2#lo,$t0#lo,#2	@ four
694	vstr		$t1#lo,[sp,#4*(16+2)]
695	vmov		$a1,$a0
696	vstr		$t2#lo,[sp,#4*(16+4)]
697	vmov		$a2,$a0
698	vmov		$b1,$b0
699	vmov		$b2,$b0
700	b		.Loop_neon_enter
701
702.align	4
703.Loop_neon_outer:
704	ldmia		sp,{r0-r9}		@ load key material
705	cmp		@t[3],#64*2		@ if len<=64*2
706	bls		.Lbreak_neon		@ switch to integer-only
707	vmov		$a1,$a0
708	str		@t[3],[sp,#4*(32+2)]	@ save len
709	vmov		$a2,$a0
710	str		r12,  [sp,#4*(32+1)]	@ save inp
711	vmov		$b1,$b0
712	str		r14,  [sp,#4*(32+0)]	@ save out
713	vmov		$b2,$b0
714.Loop_neon_enter:
715	ldr		@t[3], [sp,#4*(15)]
716	vadd.i32	$d1,$d0,$t0		@ counter+1
717	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
718	vmov		$c1,$c0
719	ldr		@t[2], [sp,#4*(13)]
720	vmov		$c2,$c0
721	ldr		@x[14],[sp,#4*(14)]
722	vadd.i32	$d2,$d1,$t0		@ counter+2
723	str		@t[3], [sp,#4*(16+15)]
724	mov		@t[3],#10
725	add		@x[12],@x[12],#3	@ counter+3
726	b		.Loop_neon
727
728.align	4
729.Loop_neon:
730	subs		@t[3],@t[3],#1
731___
732	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
733	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
734	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
735	my @thread3=&ROUND(0,4,8,12);
736
737	foreach (@thread0) {
738		eval;			eval(shift(@thread3));
739		eval(shift(@thread1));	eval(shift(@thread3));
740		eval(shift(@thread2));	eval(shift(@thread3));
741	}
742
743	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
744	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
745	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
746	@thread3=&ROUND(0,5,10,15);
747
748	foreach (@thread0) {
749		eval;			eval(shift(@thread3));
750		eval(shift(@thread1));	eval(shift(@thread3));
751		eval(shift(@thread2));	eval(shift(@thread3));
752	}
753$code.=<<___;
754	bne		.Loop_neon
755
756	add		@t[3],sp,#32
757	vld1.32		{$t0-$t1},[sp]		@ load key material
758	vld1.32		{$t2-$t3},[@t[3]]
759
760	ldr		@t[3],[sp,#4*(32+2)]	@ load len
761
762	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
763	str		@t[1], [sp,#4*(16+9)]
764	str		@x[12],[sp,#4*(16+12)]
765	str		@t[2], [sp,#4*(16+13)]
766	str		@x[14],[sp,#4*(16+14)]
767
768	@ at this point we have first half of 512-bit result in
769	@ @x[0-7] and second half at sp+4*(16+8)
770
771	ldr		r12,[sp,#4*(32+1)]	@ load inp
772	ldr		r14,[sp,#4*(32+0)]	@ load out
773
774	vadd.i32	$a0,$a0,$t0		@ accumulate key material
775	vadd.i32	$a1,$a1,$t0
776	vadd.i32	$a2,$a2,$t0
777	vldr		$t0#lo,[sp,#4*(16+0)]	@ one
778
779	vadd.i32	$b0,$b0,$t1
780	vadd.i32	$b1,$b1,$t1
781	vadd.i32	$b2,$b2,$t1
782	vldr		$t1#lo,[sp,#4*(16+2)]	@ two
783
784	vadd.i32	$c0,$c0,$t2
785	vadd.i32	$c1,$c1,$t2
786	vadd.i32	$c2,$c2,$t2
787	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
788	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2
789
790	vadd.i32	$d0,$d0,$t3
791	vadd.i32	$d1,$d1,$t3
792	vadd.i32	$d2,$d2,$t3
793
794	cmp		@t[3],#64*4
795	blo		.Ltail_neon
796
797	vld1.8		{$t0-$t1},[r12]!	@ load input
798	 mov		@t[3],sp
799	vld1.8		{$t2-$t3},[r12]!
800	veor		$a0,$a0,$t0		@ xor with input
801	veor		$b0,$b0,$t1
802	vld1.8		{$t0-$t1},[r12]!
803	veor		$c0,$c0,$t2
804	veor		$d0,$d0,$t3
805	vld1.8		{$t2-$t3},[r12]!
806
807	veor		$a1,$a1,$t0
808	 vst1.8		{$a0-$b0},[r14]!	@ store output
809	veor		$b1,$b1,$t1
810	vld1.8		{$t0-$t1},[r12]!
811	veor		$c1,$c1,$t2
812	 vst1.8		{$c0-$d0},[r14]!
813	veor		$d1,$d1,$t3
814	vld1.8		{$t2-$t3},[r12]!
815
816	veor		$a2,$a2,$t0
817	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
818	 veor		$t0#hi,$t0#hi,$t0#hi
819	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
820	veor		$b2,$b2,$t1
821	 vld1.32	{$c0-$d0},[@t[3]]
822	veor		$c2,$c2,$t2
823	 vst1.8		{$a1-$b1},[r14]!
824	veor		$d2,$d2,$t3
825	 vst1.8		{$c1-$d1},[r14]!
826
827	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
828	vldr		$t0#lo,[sp,#4*(16+0)]	@ one
829
830	ldmia		sp,{@t[0]-@t[3]}	@ load key material
831	add		@x[0],@x[0],@t[0]	@ accumulate key material
832	ldr		@t[0],[r12],#16		@ load input
833	 vst1.8		{$a2-$b2},[r14]!
834	add		@x[1],@x[1],@t[1]
835	ldr		@t[1],[r12,#-12]
836	 vst1.8		{$c2-$d2},[r14]!
837	add		@x[2],@x[2],@t[2]
838	ldr		@t[2],[r12,#-8]
839	add		@x[3],@x[3],@t[3]
840	ldr		@t[3],[r12,#-4]
841# ifdef	__ARMEB__
842	rev		@x[0],@x[0]
843	rev		@x[1],@x[1]
844	rev		@x[2],@x[2]
845	rev		@x[3],@x[3]
846# endif
847	eor		@x[0],@x[0],@t[0]	@ xor with input
848	 add		@t[0],sp,#4*(4)
849	eor		@x[1],@x[1],@t[1]
850	str		@x[0],[r14],#16		@ store output
851	eor		@x[2],@x[2],@t[2]
852	str		@x[1],[r14,#-12]
853	eor		@x[3],@x[3],@t[3]
854	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
855	str		@x[2],[r14,#-8]
856	str		@x[3],[r14,#-4]
857
858	add		@x[4],@x[4],@t[0]	@ accumulate key material
859	ldr		@t[0],[r12],#16		@ load input
860	add		@x[5],@x[5],@t[1]
861	ldr		@t[1],[r12,#-12]
862	add		@x[6],@x[6],@t[2]
863	ldr		@t[2],[r12,#-8]
864	add		@x[7],@x[7],@t[3]
865	ldr		@t[3],[r12,#-4]
866# ifdef	__ARMEB__
867	rev		@x[4],@x[4]
868	rev		@x[5],@x[5]
869	rev		@x[6],@x[6]
870	rev		@x[7],@x[7]
871# endif
872	eor		@x[4],@x[4],@t[0]
873	 add		@t[0],sp,#4*(8)
874	eor		@x[5],@x[5],@t[1]
875	str		@x[4],[r14],#16		@ store output
876	eor		@x[6],@x[6],@t[2]
877	str		@x[5],[r14,#-12]
878	eor		@x[7],@x[7],@t[3]
879	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
880	str		@x[6],[r14,#-8]
881	 add		@x[0],sp,#4*(16+8)
882	str		@x[7],[r14,#-4]
883
884	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half
885
886	add		@x[0],@x[0],@t[0]	@ accumulate key material
887	ldr		@t[0],[r12],#16		@ load input
888	add		@x[1],@x[1],@t[1]
889	ldr		@t[1],[r12,#-12]
890# ifdef	__thumb2__
891	it	hi
892# endif
893	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
894	add		@x[2],@x[2],@t[2]
895	ldr		@t[2],[r12,#-8]
896# ifdef	__thumb2__
897	it	hi
898# endif
899	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
900	add		@x[3],@x[3],@t[3]
901	ldr		@t[3],[r12,#-4]
902# ifdef	__ARMEB__
903	rev		@x[0],@x[0]
904	rev		@x[1],@x[1]
905	rev		@x[2],@x[2]
906	rev		@x[3],@x[3]
907# endif
908	eor		@x[0],@x[0],@t[0]
909	 add		@t[0],sp,#4*(12)
910	eor		@x[1],@x[1],@t[1]
911	str		@x[0],[r14],#16		@ store output
912	eor		@x[2],@x[2],@t[2]
913	str		@x[1],[r14,#-12]
914	eor		@x[3],@x[3],@t[3]
915	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
916	str		@x[2],[r14,#-8]
917	str		@x[3],[r14,#-4]
918
919	add		@x[4],@x[4],@t[0]	@ accumulate key material
920	 add		@t[0],@t[0],#4		@ next counter value
921	add		@x[5],@x[5],@t[1]
922	 str		@t[0],[sp,#4*(12)]	@ save next counter value
923	ldr		@t[0],[r12],#16		@ load input
924	add		@x[6],@x[6],@t[2]
925	 add		@x[4],@x[4],#3		@ counter+3
926	ldr		@t[1],[r12,#-12]
927	add		@x[7],@x[7],@t[3]
928	ldr		@t[2],[r12,#-8]
929	ldr		@t[3],[r12,#-4]
930# ifdef	__ARMEB__
931	rev		@x[4],@x[4]
932	rev		@x[5],@x[5]
933	rev		@x[6],@x[6]
934	rev		@x[7],@x[7]
935# endif
936	eor		@x[4],@x[4],@t[0]
937# ifdef	__thumb2__
938	it	hi
939# endif
940	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
941	eor		@x[5],@x[5],@t[1]
942	eor		@x[6],@x[6],@t[2]
943	str		@x[4],[r14],#16		@ store output
944	eor		@x[7],@x[7],@t[3]
945	str		@x[5],[r14,#-12]
946	 sub		@t[3],@t[0],#64*4	@ len-=64*4
947	str		@x[6],[r14,#-8]
948	str		@x[7],[r14,#-4]
949	bhi		.Loop_neon_outer
950
951	b		.Ldone_neon
952
953.align	4
954.Lbreak_neon:
955	@ harmonize NEON and integer-only stack frames: load data
956	@ from NEON frame, but save to integer-only one; distance
957	@ between the two is 4*(32+4+16-32)=4*(20).
958
959	str		@t[3], [sp,#4*(20+32+2)]	@ save len
960	 add		@t[3],sp,#4*(32+4)
961	str		r12,   [sp,#4*(20+32+1)]	@ save inp
962	str		r14,   [sp,#4*(20+32+0)]	@ save out
963
964	ldr		@x[12],[sp,#4*(16+10)]
965	ldr		@x[14],[sp,#4*(16+11)]
966	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
967	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
968	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"
969
970	ldr		@t[3], [sp,#4*(15)]
971	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
972	ldr		@t[2], [sp,#4*(13)]
973	ldr		@x[14],[sp,#4*(14)]
974	str		@t[3], [sp,#4*(20+16+15)]
975	add		@t[3],sp,#4*(20)
976	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
977	add		sp,sp,#4*(20)			@ switch frame
978	vst1.32		{$c0-$d0},[@t[3]]
979	mov		@t[3],#10
980	b		.Loop				@ go integer-only
981
982.align	4
983.Ltail_neon:
984	cmp		@t[3],#64*3
985	bhs		.L192_or_more_neon
986	cmp		@t[3],#64*2
987	bhs		.L128_or_more_neon
988	cmp		@t[3],#64*1
989	bhs		.L64_or_more_neon
990
991	add		@t[0],sp,#4*(8)
992	vst1.8		{$a0-$b0},[sp]
993	add		@t[2],sp,#4*(0)
994	vst1.8		{$c0-$d0},[@t[0]]
995	b		.Loop_tail_neon
996
997.align	4
998.L64_or_more_neon:
999	vld1.8		{$t0-$t1},[r12]!
1000	vld1.8		{$t2-$t3},[r12]!
1001	veor		$a0,$a0,$t0
1002	veor		$b0,$b0,$t1
1003	veor		$c0,$c0,$t2
1004	veor		$d0,$d0,$t3
1005	vst1.8		{$a0-$b0},[r14]!
1006	vst1.8		{$c0-$d0},[r14]!
1007
1008	beq		.Ldone_neon
1009
1010	add		@t[0],sp,#4*(8)
1011	vst1.8		{$a1-$b1},[sp]
1012	add		@t[2],sp,#4*(0)
1013	vst1.8		{$c1-$d1},[@t[0]]
1014	sub		@t[3],@t[3],#64*1	@ len-=64*1
1015	b		.Loop_tail_neon
1016
1017.align	4
1018.L128_or_more_neon:
1019	vld1.8		{$t0-$t1},[r12]!
1020	vld1.8		{$t2-$t3},[r12]!
1021	veor		$a0,$a0,$t0
1022	veor		$b0,$b0,$t1
1023	vld1.8		{$t0-$t1},[r12]!
1024	veor		$c0,$c0,$t2
1025	veor		$d0,$d0,$t3
1026	vld1.8		{$t2-$t3},[r12]!
1027
1028	veor		$a1,$a1,$t0
1029	veor		$b1,$b1,$t1
1030	 vst1.8		{$a0-$b0},[r14]!
1031	veor		$c1,$c1,$t2
1032	 vst1.8		{$c0-$d0},[r14]!
1033	veor		$d1,$d1,$t3
1034	vst1.8		{$a1-$b1},[r14]!
1035	vst1.8		{$c1-$d1},[r14]!
1036
1037	beq		.Ldone_neon
1038
1039	add		@t[0],sp,#4*(8)
1040	vst1.8		{$a2-$b2},[sp]
1041	add		@t[2],sp,#4*(0)
1042	vst1.8		{$c2-$d2},[@t[0]]
1043	sub		@t[3],@t[3],#64*2	@ len-=64*2
1044	b		.Loop_tail_neon
1045
1046.align	4
1047.L192_or_more_neon:
1048	vld1.8		{$t0-$t1},[r12]!
1049	vld1.8		{$t2-$t3},[r12]!
1050	veor		$a0,$a0,$t0
1051	veor		$b0,$b0,$t1
1052	vld1.8		{$t0-$t1},[r12]!
1053	veor		$c0,$c0,$t2
1054	veor		$d0,$d0,$t3
1055	vld1.8		{$t2-$t3},[r12]!
1056
1057	veor		$a1,$a1,$t0
1058	veor		$b1,$b1,$t1
1059	vld1.8		{$t0-$t1},[r12]!
1060	veor		$c1,$c1,$t2
1061	 vst1.8		{$a0-$b0},[r14]!
1062	veor		$d1,$d1,$t3
1063	vld1.8		{$t2-$t3},[r12]!
1064
1065	veor		$a2,$a2,$t0
1066	 vst1.8		{$c0-$d0},[r14]!
1067	veor		$b2,$b2,$t1
1068	 vst1.8		{$a1-$b1},[r14]!
1069	veor		$c2,$c2,$t2
1070	 vst1.8		{$c1-$d1},[r14]!
1071	veor		$d2,$d2,$t3
1072	vst1.8		{$a2-$b2},[r14]!
1073	vst1.8		{$c2-$d2},[r14]!
1074
1075	beq		.Ldone_neon
1076
1077	ldmia		sp,{@t[0]-@t[3]}	@ load key material
1078	add		@x[0],@x[0],@t[0]	@ accumulate key material
1079	 add		@t[0],sp,#4*(4)
1080	add		@x[1],@x[1],@t[1]
1081	add		@x[2],@x[2],@t[2]
1082	add		@x[3],@x[3],@t[3]
1083	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
1084
1085	add		@x[4],@x[4],@t[0]	@ accumulate key material
1086	 add		@t[0],sp,#4*(8)
1087	add		@x[5],@x[5],@t[1]
1088	add		@x[6],@x[6],@t[2]
1089	add		@x[7],@x[7],@t[3]
1090	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
1091# ifdef	__ARMEB__
1092	rev		@x[0],@x[0]
1093	rev		@x[1],@x[1]
1094	rev		@x[2],@x[2]
1095	rev		@x[3],@x[3]
1096	rev		@x[4],@x[4]
1097	rev		@x[5],@x[5]
1098	rev		@x[6],@x[6]
1099	rev		@x[7],@x[7]
1100# endif
1101	stmia		sp,{@x[0]-@x[7]}
1102	 add		@x[0],sp,#4*(16+8)
1103
1104	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half
1105
1106	add		@x[0],@x[0],@t[0]	@ accumulate key material
1107	 add		@t[0],sp,#4*(12)
1108	add		@x[1],@x[1],@t[1]
1109	add		@x[2],@x[2],@t[2]
1110	add		@x[3],@x[3],@t[3]
1111	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
1112
1113	add		@x[4],@x[4],@t[0]	@ accumulate key material
1114	 add		@t[0],sp,#4*(8)
1115	add		@x[5],@x[5],@t[1]
1116	 add		@x[4],@x[4],#3		@ counter+3
1117	add		@x[6],@x[6],@t[2]
1118	add		@x[7],@x[7],@t[3]
1119	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
1120# ifdef	__ARMEB__
1121	rev		@x[0],@x[0]
1122	rev		@x[1],@x[1]
1123	rev		@x[2],@x[2]
1124	rev		@x[3],@x[3]
1125	rev		@x[4],@x[4]
1126	rev		@x[5],@x[5]
1127	rev		@x[6],@x[6]
1128	rev		@x[7],@x[7]
1129# endif
1130	stmia		@t[0],{@x[0]-@x[7]}
1131	 add		@t[2],sp,#4*(0)
1132	 sub		@t[3],@t[3],#64*3	@ len-=64*3
1133
1134.Loop_tail_neon:
1135	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
1136	ldrb		@t[1],[r12],#1		@ read input
1137	subs		@t[3],@t[3],#1
1138	eor		@t[0],@t[0],@t[1]
1139	strb		@t[0],[r14],#1		@ store output
1140	bne		.Loop_tail_neon
1141
1142.Ldone_neon:
1143	add		sp,sp,#4*(32+4)
1144	vldmia		sp,{d8-d15}
1145	add		sp,sp,#4*(16+3)
1146	ldmia		sp!,{r4-r11,pc}
1147.size	ChaCha20_neon,.-ChaCha20_neon
1148.comm	OPENSSL_armcap_P,4,4
1149#endif
1150___
1151}}}
1152
1153foreach (split("\n",$code)) {
1154	s/\`([^\`]*)\`/eval $1/geo;
1155
1156	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1157
1158	print $_,"\n";
1159}
1160close STDOUT or die "error closing STDOUT: $!";
1161