1#! /usr/bin/env perl
2# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# March 2015
18#
19# "Teaser" Montgomery multiplication module for ARMv8. Needs more
20# work. While it does improve RSA sign performance by 20-30% (less for
21# longer keys) on most processors, for some reason RSA2048 is not
22# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23# instruction issue rate is limited on processor in question, meaning
24# that dedicated squaring procedure is a must. Well, actually all
25# contemporary AArch64 processors seem to have limited multiplication
26# issue rate, i.e. they can't issue multiplication every cycle, which
27# explains moderate improvement coefficients in comparison to
28# compiler-generated code. Recall that compiler is instructed to use
29# umulh and therefore uses same amount of multiplication instructions
30# to do the job. Assembly's edge is to minimize number of "collateral"
31# instructions and of course instruction scheduling.
32#
33# April 2015
34#
35# Squaring procedure that handles lengths divisible by 8 improves
36# RSA/DSA performance by 25-40-60% depending on processor and key
37# length. Overall improvement coefficients are always positive in
38# comparison to compiler-generated code. On Cortex-A57 improvement
39# is still modest on longest key lengths, while others exhibit e.g.
40# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41# on Cortex-A57 and ~60-100% faster on others.
42
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator either next to this script or in the
# shared perlasm directory.  Guard the directory extraction: if $0 carries
# no directory component the match fails and $1 would be stale/undef.
my $dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe all generated code through the translator; everything printed to
# STDOUT from here on goes into the pipe.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";	# report the OS error ($!), not $1
*STDOUT=*OUT;
56
# Working registers for the scalar Montgomery loop: x6-x17 plus the
# callee-saved x19-x24 (x18, the platform register on some ABIs, is
# deliberately skipped).
($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# Argument registers per AAPCS64:
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                 const BN_ULONG *np, const BN_ULONG *n0, int num);
($rp,$ap,$bp,$np,$n0,$num)=map("x$_",0..5);
68
# Scalar bn_mul_mont entry point and generic one-word-at-a-time loop.
# The entry point dispatches: num%4!=0 -> generic loop; num<=32 or NEON
# unavailable -> scalar impls (__bn_sqr8x_mont when num%8==0, else
# __bn_mul4x_mont when num%4==0); otherwise bn_mul8x_mont_neon.
# The (*)-marked assembly comments below explain how the first
# np[0]*m1 multiply-and-add of each pass is elided: its low half is
# known to cancel, and the carry out equals ($lo0 != 0), which is
# recovered with "subs xzr,$lo0,#1".
# NOTE: the heredoc body is emitted verbatim (after register-name
# interpolation) — do not edit it without auditing flag/register flow.
$code.=<<___;
#ifndef	__KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armv8_rsa_neonized
.hidden OPENSSL_armv8_rsa_neonized
#endif
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
.Lbn_mul_mont:
	tst	$num,#3
	b.ne	.Lmul_mont
	cmp	$num,#32
	b.le	.Lscalar_impl
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	$lo0 being non-zero. So that carry can be calculated
	//	by adding -1 to $lo0. That's what next instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	stur	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	stur	xzr,[$tp,#-16]		// wipe tp
	stur	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	stur	xzr,[$tp,#-8]		// wipe tp
	stur	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
294{
295my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
296my ($Z,$Temp)=("v4.16b","v5");
297my @ACC=map("v$_",(6..13));
298my ($Bi,$Ni,$M0)=map("v$_",(28..30));
299my $sBi="s28";
300my $sM0="s30";
301my $zero="v14";
302my $temp="v15";
303my $ACCTemp="v16";
304
305my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
306my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
307
308$code.=<<___;
309.type	bn_mul8x_mont_neon,%function
310.align	5
311bn_mul8x_mont_neon:
312	stp	x29,x30,[sp,#-80]!
313	mov	x16,sp
314	stp	d8,d9,[sp,#16]
315	stp	d10,d11,[sp,#32]
316	stp	d12,d13,[sp,#48]
317	stp	d14,d15,[sp,#64]
318	lsl	$num,$num,#1
319	eor	$zero.16b,$zero.16b,$zero.16b
320
321.align	4
322.LNEON_8n:
323	eor	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b
324	sub	$toutptr,sp,#128
325	eor	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b
326	sub	$toutptr,$toutptr,$num,lsl#4
327	eor	@ACC[2].16b,@ACC[2].16b,@ACC[2].16b
328	and	$toutptr,$toutptr,#-64
329	eor	@ACC[3].16b,@ACC[3].16b,@ACC[3].16b
330	mov	sp,$toutptr		// alloca
331	eor	@ACC[4].16b,@ACC[4].16b,@ACC[4].16b
332	add	$toutptr,$toutptr,#256
333	eor	@ACC[5].16b,@ACC[5].16b,@ACC[5].16b
334	sub	$inner,$num,#8
335	eor	@ACC[6].16b,@ACC[6].16b,@ACC[6].16b
336	eor	@ACC[7].16b,@ACC[7].16b,@ACC[7].16b
337
338.LNEON_8n_init:
339	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
340	subs	$inner,$inner,#8
341	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
342	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
343	st1	{@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
344	bne	.LNEON_8n_init
345
346	add	$tinptr,sp,#256
347	ld1	{$A0.4s,$A1.4s},[$aptr],#32
348	add	$bnptr,sp,#8
349	ldr	$sM0,[$n0],#4
350	mov	$outer,$num
351	b	.LNEON_8n_outer
352
353.align	4
354.LNEON_8n_outer:
355	ldr	$sBi,[$bptr],#4   // *b++
356	uxtl	$Bi.4s,$Bi.4h
357	add	$toutptr,sp,#128
358	ld1	{$N0.4s,$N1.4s},[$nptr],#32
359
360	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
361	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
362	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
363	shl	$Ni.2d,@ACC[0].2d,#16
364	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
365	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
366	add	$Ni.2d,$Ni.2d,@ACC[0].2d
367	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
368	mul	$Ni.2s,$Ni.2s,$M0.2s
369	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
370	st1	{$Bi.2s},[sp]		// put aside smashed b[8*i+0]
371	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
372	uxtl	$Ni.4s,$Ni.4h
373	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
374___
375for ($i=0; $i<7;) {
376$code.=<<___;
377	ldr	$sBi,[$bptr],#4   // *b++
378	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
379	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
380	uxtl	$Bi.4s,$Bi.4h
381	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
382	ushr	$temp.2d,@ACC[0].2d,#16
383	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
384	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
385	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
386	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
387	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
388	ushr	@ACC[0].2d,@ACC[0].2d,#16
389	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
390	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
391	add	$ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
392	ins	@ACC[1].d[0],$ACCTemp.d[0]
393	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
394___
395	push(@ACC,shift(@ACC));	$i++;
396$code.=<<___;
397	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
398	ld1	{@ACC[7].2d},[$tinptr],#16
399	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
400	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
401	shl	$Ni.2d,@ACC[0].2d,#16
402	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
403	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
404	add	$Ni.2d,$Ni.2d,@ACC[0].2d
405	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
406	mul	$Ni.2s,$Ni.2s,$M0.2s
407	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
408	st1	{$Bi.2s},[$bnptr],#8	// put aside smashed b[8*i+$i]
409	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
410	uxtl	$Ni.4s,$Ni.4h
411	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
412___
413}
414$code.=<<___;
415	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
416	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
417	ld1	{$A0.4s,$A1.4s},[$aptr],#32
418	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
419	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
420	mov	$Temp.16b,@ACC[0].16b
421	ushr	$Temp.2d,$Temp.2d,#16
422	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
423	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
424	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
425	add	@ACC[0].2d,@ACC[0].2d,$Temp.2d
426	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
427	ushr	@ACC[0].2d,@ACC[0].2d,#16
428	eor	$temp.16b,$temp.16b,$temp.16b
429	ins	@ACC[0].d[1],$temp.d[0]
430	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
431	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
432	add	@ACC[1].2d,@ACC[1].2d,@ACC[0].2d
433	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
434	add	$bnptr,sp,#8		// rewind
435___
436	push(@ACC,shift(@ACC));
437$code.=<<___;
438	sub	$inner,$num,#8
439	b	.LNEON_8n_inner
440
441.align	4
442.LNEON_8n_inner:
443	subs	$inner,$inner,#8
444	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
445	ld1	{@ACC[7].2d},[$tinptr]
446	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
447	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+0]
448	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
449	ld1	{$N0.4s,$N1.4s},[$nptr],#32
450	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
451	b.eq	.LInner_jump
452	add	$tinptr,$tinptr,#16	// don't advance in last iteration
453.LInner_jump:
454	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
455	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
456	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
457	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
458___
459for ($i=1; $i<8; $i++) {
460$code.=<<___;
461	ld1	{$Bi.2s},[$bnptr],#8	// pull smashed b[8*i+$i]
462	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
463	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
464	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
465	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
466	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
467	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
468	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
469	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
470	st1	{@ACC[0].2d},[$toutptr],#16
471___
472	push(@ACC,shift(@ACC));
473$code.=<<___;
474	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
475	ld1	{@ACC[7].2d},[$tinptr]
476	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
477	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+$i]
478	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
479	b.eq	.LInner_jump$i
480	add	$tinptr,$tinptr,#16	// don't advance in last iteration
481.LInner_jump$i:
482	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
483	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
484	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
485	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
486	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
487___
488}
489$code.=<<___;
490	b.ne	.LInner_after_rewind$i
491	sub	$aptr,$aptr,$num,lsl#2	// rewind
492.LInner_after_rewind$i:
493	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
494	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
495	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
496	ld1	{$A0.4s,$A1.4s},[$aptr],#32
497	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
498	add	$bnptr,sp,#8		// rewind
499	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
500	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
501	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
502	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
503	st1	{@ACC[0].2d},[$toutptr],#16
504	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
505
506	bne	.LNEON_8n_inner
507___
508	push(@ACC,shift(@ACC));
509$code.=<<___;
510	add	$tinptr,sp,#128
511	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
512	eor	$N0.16b,$N0.16b,$N0.16b	// $N0
513	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
514	eor	$N1.16b,$N1.16b,$N1.16b	// $N1
515	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
516	st1	{@ACC[6].2d},[$toutptr]
517
518	subs	$outer,$outer,#8
519	ld1	{@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
520	ld1	{@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
521	ld1	{@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
522	ld1	{@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
523
524	b.eq	.LInner_8n_jump_2steps
525	sub	$nptr,$nptr,$num,lsl#2	// rewind
526	b	.LNEON_8n_outer
527
528.LInner_8n_jump_2steps:
529	add	$toutptr,sp,#128
530	st1	{$N0.2d,$N1.2d}, [sp],#32	// start wiping stack frame
531	mov	$Temp.16b,@ACC[0].16b
532	ushr	$temp.2d,@ACC[0].2d,#16
533	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
534	st1	{$N0.2d,$N1.2d}, [sp],#32
535	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
536	st1	{$N0.2d,$N1.2d}, [sp],#32
537	ushr	$temp.2d,@ACC[0].2d,#16
538	st1	{$N0.2d,$N1.2d}, [sp],#32
539	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
540	ins	$temp.d[1],$zero.d[0]
541
542	mov	$inner,$num
543	b	.LNEON_tail_entry
544
545.align	4
546.LNEON_tail:
547	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
548	mov	$Temp.16b,@ACC[0].16b
549	ushr	$temp.2d,@ACC[0].2d,#16
550	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
551	ld1	{@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
552	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
553	ld1	{@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
554	ushr	$temp.2d,@ACC[0].2d,#16
555	ld1	{@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
556	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
557	ins	$temp.d[1],$zero.d[0]
558
559.LNEON_tail_entry:
560___
561for ($i=1; $i<8; $i++) {
562$code.=<<___;
563	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
564	st1	{@ACC[0].s}[0], [$toutptr],#4
565	ushr	$temp.2d,@ACC[1].2d,#16
566	mov	$Temp.16b,@ACC[1].16b
567	ext	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
568	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
569	ushr	$temp.2d,@ACC[1].2d,#16
570	zip1	@ACC[1].4h,$Temp.4h,@ACC[1].4h
571	ins	$temp.d[1],$zero.d[0]
572___
573	push(@ACC,shift(@ACC));
574}
575	push(@ACC,shift(@ACC));
576$code.=<<___;
577	ld1	{@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
578	subs	$inner,$inner,#8
579	st1	{@ACC[7].s}[0], [$toutptr],#4
580	bne	.LNEON_tail
581
582	st1	{$temp.s}[0], [$toutptr],#4	// top-most bit
583	sub	$nptr,$nptr,$num,lsl#2		// rewind $nptr
584	subs	$aptr,sp,#0			// clear carry flag
585	add	$bptr,sp,$num,lsl#2
586
587.LNEON_sub:
588	ldp	w4,w5,[$aptr],#8
589	ldp	w6,w7,[$aptr],#8
590	ldp	w8,w9,[$nptr],#8
591	ldp	w10,w11,[$nptr],#8
592	sbcs	w8,w4,w8
593	sbcs	w9,w5,w9
594	sbcs	w10,w6,w10
595	sbcs	w11,w7,w11
596	sub	x17,$bptr,$aptr
597	stp	w8,w9,[$rptr],#8
598	stp	w10,w11,[$rptr],#8
599	cbnz	x17,.LNEON_sub
600
601	ldr	w10, [$aptr]		// load top-most bit
602	mov	x11,sp
603	eor	v0.16b,v0.16b,v0.16b
604	sub	x11,$bptr,x11		// this is num*4
605	eor	v1.16b,v1.16b,v1.16b
606	mov	$aptr,sp
607	sub	$rptr,$rptr,x11		// rewind $rptr
608	mov	$nptr,$bptr		// second 3/4th of frame
609	sbcs	w10,w10,wzr		// result is carry flag
610
611.LNEON_copy_n_zap:
612	ldp	w4,w5,[$aptr],#8
613	ldp	w6,w7,[$aptr],#8
614	ldp	w8,w9,[$rptr],#8
615	ldp	w10,w11,[$rptr]
616	sub	$rptr,$rptr,#8
617	b.cs	.LCopy_1
618	mov	w8,w4
619	mov	w9,w5
620	mov	w10,w6
621	mov	w11,w7
622.LCopy_1:
623	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
624	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
625	ldp	w4,w5,[$aptr],#8
626	ldp	w6,w7,[$aptr],#8
627	stp	w8,w9,[$rptr],#8
628	stp	w10,w11,[$rptr],#8
629	sub	$aptr,$aptr,#32
630	ldp	w8,w9,[$rptr],#8
631	ldp	w10,w11,[$rptr]
632	sub	$rptr,$rptr,#8
633	b.cs	.LCopy_2
634	mov	w8, w4
635	mov	w9, w5
636	mov	w10, w6
637	mov	w11, w7
638.LCopy_2:
639	st1	{v0.2d,v1.2d}, [$aptr],#32		// wipe
640	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
641	sub	x17,$bptr,$aptr		// preserves carry
642	stp	w8,w9,[$rptr],#8
643	stp	w10,w11,[$rptr],#8
644	cbnz	x17,.LNEON_copy_n_zap
645
646	mov	sp,x16
647	ldp	d14,d15,[sp,#64]
648	ldp	d12,d13,[sp,#48]
649	ldp	d10,d11,[sp,#32]
650	ldp	d8,d9,[sp,#16]
651	ldr	x29,[sp],#80
652	ret			// bx lr
653
654.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
655___
656}
657{
658########################################################################
659# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
660
661my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
662my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
663my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
664my ($cnt,$carry,$topmost)=("x27","x28","x30");
665my ($tp,$ap_end,$na0)=($bp,$np,$carry);
666
667$code.=<<___;
668.type	__bn_sqr8x_mont,%function
669.align	5
670__bn_sqr8x_mont:
671	cmp	$ap,$bp
672	b.ne	__bn_mul4x_mont
673.Lsqr8x_mont:
674	.inst	0xd503233f		// paciasp
675	stp	x29,x30,[sp,#-128]!
676	add	x29,sp,#0
677	stp	x19,x20,[sp,#16]
678	stp	x21,x22,[sp,#32]
679	stp	x23,x24,[sp,#48]
680	stp	x25,x26,[sp,#64]
681	stp	x27,x28,[sp,#80]
682	stp	$rp,$np,[sp,#96]	// offload rp and np
683
684	ldp	$a0,$a1,[$ap,#8*0]
685	ldp	$a2,$a3,[$ap,#8*2]
686	ldp	$a4,$a5,[$ap,#8*4]
687	ldp	$a6,$a7,[$ap,#8*6]
688
689	sub	$tp,sp,$num,lsl#4
690	lsl	$num,$num,#3
691	ldr	$n0,[$n0]		// *n0
692	mov	sp,$tp			// alloca
693	sub	$cnt,$num,#8*8
694	b	.Lsqr8x_zero_start
695
696.Lsqr8x_zero:
697	sub	$cnt,$cnt,#8*8
698	stp	xzr,xzr,[$tp,#8*0]
699	stp	xzr,xzr,[$tp,#8*2]
700	stp	xzr,xzr,[$tp,#8*4]
701	stp	xzr,xzr,[$tp,#8*6]
702.Lsqr8x_zero_start:
703	stp	xzr,xzr,[$tp,#8*8]
704	stp	xzr,xzr,[$tp,#8*10]
705	stp	xzr,xzr,[$tp,#8*12]
706	stp	xzr,xzr,[$tp,#8*14]
707	add	$tp,$tp,#8*16
708	cbnz	$cnt,.Lsqr8x_zero
709
710	add	$ap_end,$ap,$num
711	add	$ap,$ap,#8*8
712	mov	$acc0,xzr
713	mov	$acc1,xzr
714	mov	$acc2,xzr
715	mov	$acc3,xzr
716	mov	$acc4,xzr
717	mov	$acc5,xzr
718	mov	$acc6,xzr
719	mov	$acc7,xzr
720	mov	$tp,sp
721	str	$n0,[x29,#112]		// offload n0
722
723	// Multiply everything but a[i]*a[i]
724.align	4
725.Lsqr8x_outer_loop:
726        //                                                 a[1]a[0]	(i)
727        //                                             a[2]a[0]
728        //                                         a[3]a[0]
729        //                                     a[4]a[0]
730        //                                 a[5]a[0]
731        //                             a[6]a[0]
732        //                         a[7]a[0]
733        //                                         a[2]a[1]		(ii)
734        //                                     a[3]a[1]
735        //                                 a[4]a[1]
736        //                             a[5]a[1]
737        //                         a[6]a[1]
738        //                     a[7]a[1]
739        //                                 a[3]a[2]			(iii)
740        //                             a[4]a[2]
741        //                         a[5]a[2]
742        //                     a[6]a[2]
743        //                 a[7]a[2]
744        //                         a[4]a[3]				(iv)
745        //                     a[5]a[3]
746        //                 a[6]a[3]
747        //             a[7]a[3]
748        //                 a[5]a[4]					(v)
749        //             a[6]a[4]
750        //         a[7]a[4]
751        //         a[6]a[5]						(vi)
752        //     a[7]a[5]
753        // a[7]a[6]							(vii)
754
755	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
756	mul	$t1,$a2,$a0
757	mul	$t2,$a3,$a0
758	mul	$t3,$a4,$a0
759	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
760	mul	$t0,$a5,$a0
761	adcs	$acc2,$acc2,$t1
762	mul	$t1,$a6,$a0
763	adcs	$acc3,$acc3,$t2
764	mul	$t2,$a7,$a0
765	adcs	$acc4,$acc4,$t3
766	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
767	adcs	$acc5,$acc5,$t0
768	umulh	$t0,$a2,$a0
769	adcs	$acc6,$acc6,$t1
770	umulh	$t1,$a3,$a0
771	adcs	$acc7,$acc7,$t2
772	umulh	$t2,$a4,$a0
773	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
774	adc	$acc0,xzr,xzr		// t[8]
775	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
776	umulh	$t3,$a5,$a0
777	adcs	$acc3,$acc3,$t0
778	umulh	$t0,$a6,$a0
779	adcs	$acc4,$acc4,$t1
780	umulh	$t1,$a7,$a0
781	adcs	$acc5,$acc5,$t2
782	 mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
783	adcs	$acc6,$acc6,$t3
784	 mul	$t3,$a3,$a1
785	adcs	$acc7,$acc7,$t0
786	 mul	$t0,$a4,$a1
787	adc	$acc0,$acc0,$t1
788
789	mul	$t1,$a5,$a1
790	adds	$acc3,$acc3,$t2
791	mul	$t2,$a6,$a1
792	adcs	$acc4,$acc4,$t3
793	mul	$t3,$a7,$a1
794	adcs	$acc5,$acc5,$t0
795	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
796	adcs	$acc6,$acc6,$t1
797	umulh	$t1,$a3,$a1
798	adcs	$acc7,$acc7,$t2
799	umulh	$t2,$a4,$a1
800	adcs	$acc0,$acc0,$t3
801	umulh	$t3,$a5,$a1
802	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
803	adc	$acc1,xzr,xzr		// t[9]
804	adds	$acc4,$acc4,$t0
805	umulh	$t0,$a6,$a1
806	adcs	$acc5,$acc5,$t1
807	umulh	$t1,$a7,$a1
808	adcs	$acc6,$acc6,$t2
809	 mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
810	adcs	$acc7,$acc7,$t3
811	 mul	$t3,$a4,$a2
812	adcs	$acc0,$acc0,$t0
813	 mul	$t0,$a5,$a2
814	adc	$acc1,$acc1,$t1
815
816	mul	$t1,$a6,$a2
817	adds	$acc5,$acc5,$t2
818	mul	$t2,$a7,$a2
819	adcs	$acc6,$acc6,$t3
820	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
821	adcs	$acc7,$acc7,$t0
822	umulh	$t0,$a4,$a2
823	adcs	$acc0,$acc0,$t1
824	umulh	$t1,$a5,$a2
825	adcs	$acc1,$acc1,$t2
826	umulh	$t2,$a6,$a2
827	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
828	adc	$acc2,xzr,xzr		// t[10]
829	adds	$acc6,$acc6,$t3
830	umulh	$t3,$a7,$a2
831	adcs	$acc7,$acc7,$t0
832	 mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
833	adcs	$acc0,$acc0,$t1
834	 mul	$t1,$a5,$a3
835	adcs	$acc1,$acc1,$t2
836	 mul	$t2,$a6,$a3
837	adc	$acc2,$acc2,$t3
838
839	mul	$t3,$a7,$a3
840	adds	$acc7,$acc7,$t0
841	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
842	adcs	$acc0,$acc0,$t1
843	umulh	$t1,$a5,$a3
844	adcs	$acc1,$acc1,$t2
845	umulh	$t2,$a6,$a3
846	adcs	$acc2,$acc2,$t3
847	umulh	$t3,$a7,$a3
848	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
849	adc	$acc3,xzr,xzr		// t[11]
850	adds	$acc0,$acc0,$t0
851	 mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
852	adcs	$acc1,$acc1,$t1
853	 mul	$t1,$a6,$a4
854	adcs	$acc2,$acc2,$t2
855	 mul	$t2,$a7,$a4
856	adc	$acc3,$acc3,$t3
857
858	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
859	adds	$acc1,$acc1,$t0
860	umulh	$t0,$a6,$a4
861	adcs	$acc2,$acc2,$t1
862	umulh	$t1,$a7,$a4
863	adcs	$acc3,$acc3,$t2
864	 mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
865	adc	$acc4,xzr,xzr		// t[12]
866	adds	$acc2,$acc2,$t3
867	 mul	$t3,$a7,$a5
868	adcs	$acc3,$acc3,$t0
869	 umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
870	adc	$acc4,$acc4,$t1
871
872	umulh	$t1,$a7,$a5
873	adds	$acc3,$acc3,$t2
874	 mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
875	adcs	$acc4,$acc4,$t3
876	 umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
877	adc	$acc5,xzr,xzr		// t[13]
878	adds	$acc4,$acc4,$t0
879	sub	$cnt,$ap_end,$ap	// done yet?
880	adc	$acc5,$acc5,$t1
881
882	adds	$acc5,$acc5,$t2
883	sub	$t0,$ap_end,$num	// rewinded ap
884	adc	$acc6,xzr,xzr		// t[14]
885	add	$acc6,$acc6,$t3
886
887	cbz	$cnt,.Lsqr8x_outer_break
888
889	mov	$n0,$a0
890	ldp	$a0,$a1,[$tp,#8*0]
891	ldp	$a2,$a3,[$tp,#8*2]
892	ldp	$a4,$a5,[$tp,#8*4]
893	ldp	$a6,$a7,[$tp,#8*6]
894	adds	$acc0,$acc0,$a0
895	adcs	$acc1,$acc1,$a1
896	ldp	$a0,$a1,[$ap,#8*0]
897	adcs	$acc2,$acc2,$a2
898	adcs	$acc3,$acc3,$a3
899	ldp	$a2,$a3,[$ap,#8*2]
900	adcs	$acc4,$acc4,$a4
901	adcs	$acc5,$acc5,$a5
902	ldp	$a4,$a5,[$ap,#8*4]
903	adcs	$acc6,$acc6,$a6
904	mov	$rp,$ap
905	adcs	$acc7,xzr,$a7
906	ldp	$a6,$a7,[$ap,#8*6]
907	add	$ap,$ap,#8*8
908	//adc	$carry,xzr,xzr		// moved below
909	mov	$cnt,#-8*8
910
911	//                                                         a[8]a[0]
912	//                                                     a[9]a[0]
913	//                                                 a[a]a[0]
914	//                                             a[b]a[0]
915	//                                         a[c]a[0]
916	//                                     a[d]a[0]
917	//                                 a[e]a[0]
918	//                             a[f]a[0]
919	//                                                     a[8]a[1]
920	//                         a[f]a[1]........................
921	//                                                 a[8]a[2]
922	//                     a[f]a[2]........................
923	//                                             a[8]a[3]
924	//                 a[f]a[3]........................
925	//                                         a[8]a[4]
926	//             a[f]a[4]........................
927	//                                     a[8]a[5]
928	//         a[f]a[5]........................
929	//                                 a[8]a[6]
930	//     a[f]a[6]........................
931	//                             a[8]a[7]
932	// a[f]a[7]........................
933.Lsqr8x_mul:
934	mul	$t0,$a0,$n0
935	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
936	mul	$t1,$a1,$n0
937	add	$cnt,$cnt,#8
938	mul	$t2,$a2,$n0
939	mul	$t3,$a3,$n0
940	adds	$acc0,$acc0,$t0
941	mul	$t0,$a4,$n0
942	adcs	$acc1,$acc1,$t1
943	mul	$t1,$a5,$n0
944	adcs	$acc2,$acc2,$t2
945	mul	$t2,$a6,$n0
946	adcs	$acc3,$acc3,$t3
947	mul	$t3,$a7,$n0
948	adcs	$acc4,$acc4,$t0
949	umulh	$t0,$a0,$n0
950	adcs	$acc5,$acc5,$t1
951	umulh	$t1,$a1,$n0
952	adcs	$acc6,$acc6,$t2
953	umulh	$t2,$a2,$n0
954	adcs	$acc7,$acc7,$t3
955	umulh	$t3,$a3,$n0
956	adc	$carry,$carry,xzr
957	str	$acc0,[$tp],#8
958	adds	$acc0,$acc1,$t0
959	umulh	$t0,$a4,$n0
960	adcs	$acc1,$acc2,$t1
961	umulh	$t1,$a5,$n0
962	adcs	$acc2,$acc3,$t2
963	umulh	$t2,$a6,$n0
964	adcs	$acc3,$acc4,$t3
965	umulh	$t3,$a7,$n0
966	ldr	$n0,[$rp,$cnt]
967	adcs	$acc4,$acc5,$t0
968	adcs	$acc5,$acc6,$t1
969	adcs	$acc6,$acc7,$t2
970	adcs	$acc7,$carry,$t3
971	//adc	$carry,xzr,xzr		// moved above
972	cbnz	$cnt,.Lsqr8x_mul
973					// note that carry flag is guaranteed
974					// to be zero at this point
975	cmp	$ap,$ap_end		// done yet?
976	b.eq	.Lsqr8x_break
977
978	ldp	$a0,$a1,[$tp,#8*0]
979	ldp	$a2,$a3,[$tp,#8*2]
980	ldp	$a4,$a5,[$tp,#8*4]
981	ldp	$a6,$a7,[$tp,#8*6]
982	adds	$acc0,$acc0,$a0
983	ldur	$n0,[$rp,#-8*8]
984	adcs	$acc1,$acc1,$a1
985	ldp	$a0,$a1,[$ap,#8*0]
986	adcs	$acc2,$acc2,$a2
987	adcs	$acc3,$acc3,$a3
988	ldp	$a2,$a3,[$ap,#8*2]
989	adcs	$acc4,$acc4,$a4
990	adcs	$acc5,$acc5,$a5
991	ldp	$a4,$a5,[$ap,#8*4]
992	adcs	$acc6,$acc6,$a6
993	mov	$cnt,#-8*8
994	adcs	$acc7,$acc7,$a7
995	ldp	$a6,$a7,[$ap,#8*6]
996	add	$ap,$ap,#8*8
997	//adc	$carry,xzr,xzr		// moved above
998	b	.Lsqr8x_mul
999
1000.align	4
1001.Lsqr8x_break:
1002	ldp	$a0,$a1,[$rp,#8*0]
1003	add	$ap,$rp,#8*8
1004	ldp	$a2,$a3,[$rp,#8*2]
1005	sub	$t0,$ap_end,$ap		// is it last iteration?
1006	ldp	$a4,$a5,[$rp,#8*4]
1007	sub	$t1,$tp,$t0
1008	ldp	$a6,$a7,[$rp,#8*6]
1009	cbz	$t0,.Lsqr8x_outer_loop
1010
1011	stp	$acc0,$acc1,[$tp,#8*0]
1012	ldp	$acc0,$acc1,[$t1,#8*0]
1013	stp	$acc2,$acc3,[$tp,#8*2]
1014	ldp	$acc2,$acc3,[$t1,#8*2]
1015	stp	$acc4,$acc5,[$tp,#8*4]
1016	ldp	$acc4,$acc5,[$t1,#8*4]
1017	stp	$acc6,$acc7,[$tp,#8*6]
1018	mov	$tp,$t1
1019	ldp	$acc6,$acc7,[$t1,#8*6]
1020	b	.Lsqr8x_outer_loop
1021
1022.align	4
1023.Lsqr8x_outer_break:
1024	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1025	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
1026	ldp	$t1,$t2,[sp,#8*1]
1027	ldp	$a5,$a7,[$t0,#8*2]
1028	add	$ap,$t0,#8*4
1029	ldp	$t3,$t0,[sp,#8*3]
1030
1031	stp	$acc0,$acc1,[$tp,#8*0]
1032	mul	$acc0,$a1,$a1
1033	stp	$acc2,$acc3,[$tp,#8*2]
1034	umulh	$a1,$a1,$a1
1035	stp	$acc4,$acc5,[$tp,#8*4]
1036	mul	$a2,$a3,$a3
1037	stp	$acc6,$acc7,[$tp,#8*6]
1038	mov	$tp,sp
1039	umulh	$a3,$a3,$a3
1040	adds	$acc1,$a1,$t1,lsl#1
1041	extr	$t1,$t2,$t1,#63
1042	sub	$cnt,$num,#8*4
1043
1044.Lsqr4x_shift_n_add:
1045	adcs	$acc2,$a2,$t1
1046	extr	$t2,$t3,$t2,#63
1047	sub	$cnt,$cnt,#8*4
1048	adcs	$acc3,$a3,$t2
1049	ldp	$t1,$t2,[$tp,#8*5]
1050	mul	$a4,$a5,$a5
1051	ldp	$a1,$a3,[$ap],#8*2
1052	umulh	$a5,$a5,$a5
1053	mul	$a6,$a7,$a7
1054	umulh	$a7,$a7,$a7
1055	extr	$t3,$t0,$t3,#63
1056	stp	$acc0,$acc1,[$tp,#8*0]
1057	adcs	$acc4,$a4,$t3
1058	extr	$t0,$t1,$t0,#63
1059	stp	$acc2,$acc3,[$tp,#8*2]
1060	adcs	$acc5,$a5,$t0
1061	ldp	$t3,$t0,[$tp,#8*7]
1062	extr	$t1,$t2,$t1,#63
1063	adcs	$acc6,$a6,$t1
1064	extr	$t2,$t3,$t2,#63
1065	adcs	$acc7,$a7,$t2
1066	ldp	$t1,$t2,[$tp,#8*9]
1067	mul	$a0,$a1,$a1
1068	ldp	$a5,$a7,[$ap],#8*2
1069	umulh	$a1,$a1,$a1
1070	mul	$a2,$a3,$a3
1071	umulh	$a3,$a3,$a3
1072	stp	$acc4,$acc5,[$tp,#8*4]
1073	extr	$t3,$t0,$t3,#63
1074	stp	$acc6,$acc7,[$tp,#8*6]
1075	add	$tp,$tp,#8*8
1076	adcs	$acc0,$a0,$t3
1077	extr	$t0,$t1,$t0,#63
1078	adcs	$acc1,$a1,$t0
1079	ldp	$t3,$t0,[$tp,#8*3]
1080	extr	$t1,$t2,$t1,#63
1081	cbnz	$cnt,.Lsqr4x_shift_n_add
1082___
# From here on $ap/$ap_end walk the modulus rather than the input, so
# give them matching aliases (they are the same registers).
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	 ldp	$np,$n0,[x29,#104]	// pull np and n0

	// Epilogue of the shift-n-add loop: last eight limbs, the very top
	// bit is shifted in from xzr (i.e. zero) by the final extr.
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	 ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	 ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	 ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	 ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

// Montgomery reduction: one 64-bit limb of t[] is retired per iteration,
// eight iterations per 512-bit window.  Each t[i]*n0 factor is stashed at
// [$tp] so the tail loops below can reuse it for n[8..num-1].
.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	// (*)	by construction lo(n[0]*na0) cancels $acc0 exactly, so the
	//	mul is omitted and its carry-out, which is set iff $acc0 is
	//	non-zero, is recreated by the subs below.
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction
1163
1164	ldp	$t0,$t1,[$tp,#8*0]
1165	ldp	$t2,$t3,[$tp,#8*2]
1166	mov	$rp,$tp
1167	sub	$cnt,$np_end,$np	// done yet?
1168	adds	$acc0,$acc0,$t0
1169	adcs	$acc1,$acc1,$t1
1170	ldp	$t0,$t1,[$tp,#8*4]
1171	adcs	$acc2,$acc2,$t2
1172	adcs	$acc3,$acc3,$t3
1173	ldp	$t2,$t3,[$tp,#8*6]
1174	adcs	$acc4,$acc4,$t0
1175	adcs	$acc5,$acc5,$t1
1176	adcs	$acc6,$acc6,$t2
1177	adcs	$acc7,$acc7,$t3
1178	//adc	$carry,xzr,xzr		// moved below
1179	cbz	$cnt,.Lsqr8x8_post_condition
1180
1181	ldur	$n0,[$tp,#-8*8]
1182	ldp	$a0,$a1,[$np,#8*0]
1183	ldp	$a2,$a3,[$np,#8*2]
1184	ldp	$a4,$a5,[$np,#8*4]
1185	mov	$cnt,#-8*8
1186	ldp	$a6,$a7,[$np,#8*6]
1187	add	$np,$np,#8*8
1188
1189.Lsqr8x_tail:
1190	mul	$t0,$a0,$n0
1191	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
1192	mul	$t1,$a1,$n0
1193	add	$cnt,$cnt,#8
1194	mul	$t2,$a2,$n0
1195	mul	$t3,$a3,$n0
1196	adds	$acc0,$acc0,$t0
1197	mul	$t0,$a4,$n0
1198	adcs	$acc1,$acc1,$t1
1199	mul	$t1,$a5,$n0
1200	adcs	$acc2,$acc2,$t2
1201	mul	$t2,$a6,$n0
1202	adcs	$acc3,$acc3,$t3
1203	mul	$t3,$a7,$n0
1204	adcs	$acc4,$acc4,$t0
1205	umulh	$t0,$a0,$n0
1206	adcs	$acc5,$acc5,$t1
1207	umulh	$t1,$a1,$n0
1208	adcs	$acc6,$acc6,$t2
1209	umulh	$t2,$a2,$n0
1210	adcs	$acc7,$acc7,$t3
1211	umulh	$t3,$a3,$n0
1212	adc	$carry,$carry,xzr
1213	str	$acc0,[$tp],#8
1214	adds	$acc0,$acc1,$t0
1215	umulh	$t0,$a4,$n0
1216	adcs	$acc1,$acc2,$t1
1217	umulh	$t1,$a5,$n0
1218	adcs	$acc2,$acc3,$t2
1219	umulh	$t2,$a6,$n0
1220	adcs	$acc3,$acc4,$t3
1221	umulh	$t3,$a7,$n0
1222	ldr	$n0,[$rp,$cnt]
1223	adcs	$acc4,$acc5,$t0
1224	adcs	$acc5,$acc6,$t1
1225	adcs	$acc6,$acc7,$t2
1226	adcs	$acc7,$carry,$t3
1227	//adc	$carry,xzr,xzr		// moved above
1228	cbnz	$cnt,.Lsqr8x_tail
1229					// note that carry flag is guaranteed
1230					// to be zero at this point
1231	ldp	$a0,$a1,[$tp,#8*0]
1232	sub	$cnt,$np_end,$np	// done yet?
1233	sub	$t2,$np_end,$num	// rewinded np
1234	ldp	$a2,$a3,[$tp,#8*2]
1235	ldp	$a4,$a5,[$tp,#8*4]
1236	ldp	$a6,$a7,[$tp,#8*6]
1237	cbz	$cnt,.Lsqr8x_tail_break
1238
1239	ldur	$n0,[$rp,#-8*8]
1240	adds	$acc0,$acc0,$a0
1241	adcs	$acc1,$acc1,$a1
1242	ldp	$a0,$a1,[$np,#8*0]
1243	adcs	$acc2,$acc2,$a2
1244	adcs	$acc3,$acc3,$a3
1245	ldp	$a2,$a3,[$np,#8*2]
1246	adcs	$acc4,$acc4,$a4
1247	adcs	$acc5,$acc5,$a5
1248	ldp	$a4,$a5,[$np,#8*4]
1249	adcs	$acc6,$acc6,$a6
1250	mov	$cnt,#-8*8
1251	adcs	$acc7,$acc7,$a7
1252	ldp	$a6,$a7,[$np,#8*6]
1253	add	$np,$np,#8*8
1254	//adc	$carry,xzr,xzr		// moved above
1255	b	.Lsqr8x_tail
1256
1257.align	4
1258.Lsqr8x_tail_break:
1259	ldr	$n0,[x29,#112]		// pull n0
1260	add	$cnt,$tp,#8*8		// end of current t[num] window
1261
1262	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
1263	adcs	$t0,$acc0,$a0
1264	adcs	$t1,$acc1,$a1
1265	ldp	$acc0,$acc1,[$rp,#8*0]
1266	adcs	$acc2,$acc2,$a2
1267	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
1268	adcs	$acc3,$acc3,$a3
1269	ldp	$a2,$a3,[$t2,#8*2]
1270	adcs	$acc4,$acc4,$a4
1271	adcs	$acc5,$acc5,$a5
1272	ldp	$a4,$a5,[$t2,#8*4]
1273	adcs	$acc6,$acc6,$a6
1274	adcs	$acc7,$acc7,$a7
1275	ldp	$a6,$a7,[$t2,#8*6]
1276	add	$np,$t2,#8*8
1277	adc	$topmost,xzr,xzr	// top-most carry
1278	mul	$na0,$n0,$acc0
1279	stp	$t0,$t1,[$tp,#8*0]
1280	stp	$acc2,$acc3,[$tp,#8*2]
1281	ldp	$acc2,$acc3,[$rp,#8*2]
1282	stp	$acc4,$acc5,[$tp,#8*4]
1283	ldp	$acc4,$acc5,[$rp,#8*4]
1284	cmp	$cnt,x29		// did we hit the bottom?
1285	stp	$acc6,$acc7,[$tp,#8*6]
1286	mov	$tp,$rp			// slide the window
1287	ldp	$acc6,$acc7,[$rp,#8*6]
1288	mov	$cnt,#8
1289	b.ne	.Lsqr8x_reduction
1290
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

// Subtract the modulus from the result, storing the difference at rp,
// eight limbs per iteration.
.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	 mov	$tp,sp
	 add	$ap,sp,$num
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	 ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	 ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
// Constant-time select: "lo" (borrow occurred) keeps the original value
// still held in t[], otherwise the difference already stored at rp wins.
// The xzr stores wipe the t[] scratch from the stack as it is consumed.
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	 stp	xzr,xzr,[$ap,#8*0]
	 stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

// Short-cut reached when the modulus is exactly eight limbs: the whole
// result is still in registers, so subtract/select in place and store.
.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	 stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	 stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	 stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	 stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	 stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

// Restore callee-saved registers, unwind the frame and return 1.
.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	.inst	0xd50323bf		// autiasp
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}
1433
1434{
1435########################################################################
1436# Even though this might look as ARMv8 adaptation of mulx4x_mont from
1437# x86_64-mont5 module, it's different in sense that it performs
1438# reduction 256 bits at a time.
1439
1440my ($a0,$a1,$a2,$a3,
1441    $t0,$t1,$t2,$t3,
1442    $m0,$m1,$m2,$m3,
1443    $acc0,$acc1,$acc2,$acc3,$acc4,
1444    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1445my  $bp_end=$rp;
1446my  ($carry,$topmost) = ($rp,"x30");
1447
1448$code.=<<___;
1449.type	__bn_mul4x_mont,%function
1450.align	5
1451__bn_mul4x_mont:
1452	.inst	0xd503233f		// paciasp
1453	stp	x29,x30,[sp,#-128]!
1454	add	x29,sp,#0
1455	stp	x19,x20,[sp,#16]
1456	stp	x21,x22,[sp,#32]
1457	stp	x23,x24,[sp,#48]
1458	stp	x25,x26,[sp,#64]
1459	stp	x27,x28,[sp,#80]
1460
1461	sub	$tp,sp,$num,lsl#3
1462	lsl	$num,$num,#3
1463	ldr	$n0,[$n0]		// *n0
1464	sub	sp,$tp,#8*4		// alloca
1465
1466	add	$t0,$bp,$num
1467	add	$ap_end,$ap,$num
1468	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]
1469
1470	ldr	$bi,[$bp,#8*0]		// b[0]
1471	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
1472	ldp	$a2,$a3,[$ap,#8*2]
1473	add	$ap,$ap,#8*4
1474	mov	$acc0,xzr
1475	mov	$acc1,xzr
1476	mov	$acc2,xzr
1477	mov	$acc3,xzr
1478	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
1479	ldp	$m2,$m3,[$np,#8*2]
1480	adds	$np,$np,#8*4		// clear carry bit
1481	mov	$carry,xzr
1482	mov	$cnt,#0
1483	mov	$tp,sp
1484
// First outer iteration: multiply a[0..3] by b[0..3] while folding in the
// Montgomery factors t[0]*n0 on the fly.  $cnt advances by 8 and wraps
// modulo 32 bytes, so $bi cycles through the four b[] limbs of the group.
.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	// (*)	lo(n[0]*mi) cancels $acc0 exactly by construction, so the
	//	mul is skipped and its carry (set iff $acc0 is non-zero) is
	//	recreated by the subs.
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

// Tail of the first outer iteration: extend the b[0..3] products and the
// reduction across the remaining a[]/n[] limbs, four at a time, reusing
// the t[i]*n0 factors saved on the stack.
.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewinded $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail
1598
// Advance to the next group of four b[] limbs: rewind ap/np, reload the
// low t[] limbs and carry the top-most bit over in $topmost.
.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

// Steady-state outer iteration: same structure as the 1st-iteration
// reduction loop above, accumulating onto the existing t[] contents.
.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
1682
// Tail of the steady-state outer iteration: walk the remaining a[]/n[]
// limbs four at a time, reusing the saved t[i]*n0 factors from [sp].
.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewinded np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

// End of one outer iteration: fold in the top-most carry, store the last
// four result limbs, and either advance to the next b[] group or finish.
.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction
1776
.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

// Subtract the modulus, storing the difference at rp four limbs at a time.
.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	 mov	$tp,sp
	 add	$ap,sp,#8*4
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	 ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	 ldp	$acc0,$acc1,[$ap,#8*0]
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
// Constant-time select: "lo" (borrow) keeps the original value still in
// t[], otherwise the difference already stored at rp wins.  The xzr
// stores wipe the t[] scratch from the stack as it is consumed.
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	 stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

// Short-cut reached when the modulus is exactly four limbs: the whole
// result is still in registers, so subtract/select in place and store.
.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	 stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

// Restore callee-saved registers, unwind the frame and return 1.
.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
# Append the identification banner; the single-quoted heredoc leaves the
# e-mail address literal, so no backslash escaping is required.
$code .= <<'___';
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	4
___

# Flush the generated assembly and make sure the write actually succeeded.
print $code;

close STDOUT or die "error closing STDOUT: $!";
1899