1#! /usr/bin/env perl
2# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# March 2015
18#
19# "Teaser" Montgomery multiplication module for ARMv8. Needs more
20# work. While it does improve RSA sign performance by 20-30% (less for
21# longer keys) on most processors, for some reason RSA2048 is not
22# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23# instruction issue rate is limited on processor in question, meaning
24# that dedicated squaring procedure is a must. Well, actually all
25# contemporary AArch64 processors seem to have limited multiplication
26# issue rate, i.e. they can't issue multiplication every cycle, which
27# explains moderate improvement coefficients in comparison to
28# compiler-generated code. Recall that compiler is instructed to use
29# umulh and therefore uses same amount of multiplication instructions
30# to do the job. Assembly's edge is to minimize number of "collateral"
31# instructions and of course instruction scheduling.
32#
33# April 2015
34#
35# Squaring procedure that handles lengths divisible by 8 improves
36# RSA/DSA performance by 25-40-60% depending on processor and key
37# length. Overall improvement coefficients are always positive in
38# comparison to compiler-generated code. On Cortex-A57 improvement
39# is still modest on longest key lengths, while others exhibit e.g.
40# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41# on Cortex-A57 and ~60-100% faster on others.
42
43# $output is the last argument if it looks like a file (it has an extension)
44# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl "assembler translator", first next to this
# script, then in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through the translator.  On failure report
# the OS error: the original said "$1", which would have interpolated
# the stale directory capture from the regex above rather than the
# actual open() error.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
56
# Scratch registers for the scalar Montgomery loop: x6-x17 plus the
# callee-saved x19-x23 (x18, the platform register, is deliberately
# skipped; the trailing x24 produced by the map is simply unused).
my @scratch = map("x$_",6..17,19..24);
($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = @scratch;

# Argument registers, matching the C prototype:
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
($rp,$ap,$bp,$np,$n0,$num) = map("x$_",0..5);
68
# Emit the entry point and the generic scalar Montgomery multiplication.
# bn_mul_mont() itself is a dispatcher: num%4!=0 falls through to the
# generic word-by-word loop below; num%8==0 goes to __bn_sqr8x_mont
# (which further decides between squaring and 4x multiplication);
# num%4==0 goes to __bn_mul4x_mont; and, outside kernel builds, sizes
# above 32 words use the NEON path when the runtime capability flag
# OPENSSL_armv8_rsa_neonized is set.  The heredoc below is emitted
# verbatim (after register-variable interpolation) and is kept intact.
$code.=<<___;
#include "arm_arch.h"
#ifndef	__KERNEL__
.extern OPENSSL_armv8_rsa_neonized
.hidden OPENSSL_armv8_rsa_neonized
#endif
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
	tst	$num,#3
	b.ne	.Lmul_mont
	cmp	$num,#32
	b.le	.Lscalar_impl
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	$lo0 being non-zero. So that carry can be calculated
	//	by adding -1 to $lo0. That's what next instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	stur	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	stur	xzr,[$tp,#-16]		// wipe tp
	stur	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	stur	xzr,[$tp,#-8]		// wipe tp
	stur	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
296{
297my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
298my ($Z,$Temp)=("v4.16b","v5");
299my @ACC=map("v$_",(6..13));
300my ($Bi,$Ni,$M0)=map("v$_",(28..30));
301my $sBi="s28";
302my $sM0="s30";
303my $zero="v14";
304my $temp="v15";
305my $ACCTemp="v16";
306
307my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
308my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));
309
310$code.=<<___;
311.type	bn_mul8x_mont_neon,%function
312.align	5
313bn_mul8x_mont_neon:
314	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
315	// only from bn_mul_mont which has already signed the return address.
316	stp	x29,x30,[sp,#-80]!
317	mov	x16,sp
318	stp	d8,d9,[sp,#16]
319	stp	d10,d11,[sp,#32]
320	stp	d12,d13,[sp,#48]
321	stp	d14,d15,[sp,#64]
322	lsl	$num,$num,#1
323	eor	$zero.16b,$zero.16b,$zero.16b
324
325.align	4
326.LNEON_8n:
327	eor	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b
328	sub	$toutptr,sp,#128
329	eor	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b
330	sub	$toutptr,$toutptr,$num,lsl#4
331	eor	@ACC[2].16b,@ACC[2].16b,@ACC[2].16b
332	and	$toutptr,$toutptr,#-64
333	eor	@ACC[3].16b,@ACC[3].16b,@ACC[3].16b
334	mov	sp,$toutptr		// alloca
335	eor	@ACC[4].16b,@ACC[4].16b,@ACC[4].16b
336	add	$toutptr,$toutptr,#256
337	eor	@ACC[5].16b,@ACC[5].16b,@ACC[5].16b
338	sub	$inner,$num,#8
339	eor	@ACC[6].16b,@ACC[6].16b,@ACC[6].16b
340	eor	@ACC[7].16b,@ACC[7].16b,@ACC[7].16b
341
342.LNEON_8n_init:
343	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
344	subs	$inner,$inner,#8
345	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
346	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
347	st1	{@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
348	bne	.LNEON_8n_init
349
350	add	$tinptr,sp,#256
351	ld1	{$A0.4s,$A1.4s},[$aptr],#32
352	add	$bnptr,sp,#8
353	ldr	$sM0,[$n0],#4
354	mov	$outer,$num
355	b	.LNEON_8n_outer
356
357.align	4
358.LNEON_8n_outer:
359	ldr	$sBi,[$bptr],#4   // *b++
360	uxtl	$Bi.4s,$Bi.4h
361	add	$toutptr,sp,#128
362	ld1	{$N0.4s,$N1.4s},[$nptr],#32
363
364	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
365	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
366	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
367	shl	$Ni.2d,@ACC[0].2d,#16
368	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
369	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
370	add	$Ni.2d,$Ni.2d,@ACC[0].2d
371	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
372	mul	$Ni.2s,$Ni.2s,$M0.2s
373	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
374	st1	{$Bi.2s},[sp]		// put aside smashed b[8*i+0]
375	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
376	uxtl	$Ni.4s,$Ni.4h
377	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
378___
379for ($i=0; $i<7;) {
380$code.=<<___;
381	ldr	$sBi,[$bptr],#4   // *b++
382	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
383	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
384	uxtl	$Bi.4s,$Bi.4h
385	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
386	ushr	$temp.2d,@ACC[0].2d,#16
387	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
388	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
389	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
390	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
391	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
392	ushr	@ACC[0].2d,@ACC[0].2d,#16
393	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
394	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
395	add	$ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
396	ins	@ACC[1].d[0],$ACCTemp.d[0]
397	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
398___
399	push(@ACC,shift(@ACC));	$i++;
400$code.=<<___;
401	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
402	ld1	{@ACC[7].2d},[$tinptr],#16
403	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
404	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
405	shl	$Ni.2d,@ACC[0].2d,#16
406	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
407	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
408	add	$Ni.2d,$Ni.2d,@ACC[0].2d
409	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
410	mul	$Ni.2s,$Ni.2s,$M0.2s
411	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
412	st1	{$Bi.2s},[$bnptr],#8	// put aside smashed b[8*i+$i]
413	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
414	uxtl	$Ni.4s,$Ni.4h
415	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
416___
417}
418$code.=<<___;
419	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
420	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
421	ld1	{$A0.4s,$A1.4s},[$aptr],#32
422	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
423	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
424	mov	$Temp.16b,@ACC[0].16b
425	ushr	$Temp.2d,$Temp.2d,#16
426	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
427	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
428	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
429	add	@ACC[0].2d,@ACC[0].2d,$Temp.2d
430	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
431	ushr	@ACC[0].2d,@ACC[0].2d,#16
432	eor	$temp.16b,$temp.16b,$temp.16b
433	ins	@ACC[0].d[1],$temp.d[0]
434	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
435	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
436	add	@ACC[1].2d,@ACC[1].2d,@ACC[0].2d
437	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
438	add	$bnptr,sp,#8		// rewind
439___
440	push(@ACC,shift(@ACC));
441$code.=<<___;
442	sub	$inner,$num,#8
443	b	.LNEON_8n_inner
444
445.align	4
446.LNEON_8n_inner:
447	subs	$inner,$inner,#8
448	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
449	ld1	{@ACC[7].2d},[$tinptr]
450	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
451	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+0]
452	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
453	ld1	{$N0.4s,$N1.4s},[$nptr],#32
454	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
455	b.eq	.LInner_jump
456	add	$tinptr,$tinptr,#16	// don't advance in last iteration
457.LInner_jump:
458	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
459	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
460	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
461	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
462___
463for ($i=1; $i<8; $i++) {
464$code.=<<___;
465	ld1	{$Bi.2s},[$bnptr],#8	// pull smashed b[8*i+$i]
466	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
467	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
468	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
469	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
470	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
471	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
472	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
473	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
474	st1	{@ACC[0].2d},[$toutptr],#16
475___
476	push(@ACC,shift(@ACC));
477$code.=<<___;
478	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
479	ld1	{@ACC[7].2d},[$tinptr]
480	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
481	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+$i]
482	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
483	b.eq	.LInner_jump$i
484	add	$tinptr,$tinptr,#16	// don't advance in last iteration
485.LInner_jump$i:
486	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
487	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
488	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
489	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
490	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
491___
492}
493$code.=<<___;
494	b.ne	.LInner_after_rewind$i
495	sub	$aptr,$aptr,$num,lsl#2	// rewind
496.LInner_after_rewind$i:
497	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
498	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
499	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
500	ld1	{$A0.4s,$A1.4s},[$aptr],#32
501	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
502	add	$bnptr,sp,#8		// rewind
503	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
504	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
505	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
506	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
507	st1	{@ACC[0].2d},[$toutptr],#16
508	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
509
510	bne	.LNEON_8n_inner
511___
512	push(@ACC,shift(@ACC));
513$code.=<<___;
514	add	$tinptr,sp,#128
515	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
516	eor	$N0.16b,$N0.16b,$N0.16b	// $N0
517	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
518	eor	$N1.16b,$N1.16b,$N1.16b	// $N1
519	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
520	st1	{@ACC[6].2d},[$toutptr]
521
522	subs	$outer,$outer,#8
523	ld1	{@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
524	ld1	{@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
525	ld1	{@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
526	ld1	{@ACC[6].2d,@ACC[7].2d},[$tinptr],#32
527
528	b.eq	.LInner_8n_jump_2steps
529	sub	$nptr,$nptr,$num,lsl#2	// rewind
530	b	.LNEON_8n_outer
531
532.LInner_8n_jump_2steps:
533	add	$toutptr,sp,#128
534	st1	{$N0.2d,$N1.2d}, [sp],#32	// start wiping stack frame
535	mov	$Temp.16b,@ACC[0].16b
536	ushr	$temp.2d,@ACC[0].2d,#16
537	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
538	st1	{$N0.2d,$N1.2d}, [sp],#32
539	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
540	st1	{$N0.2d,$N1.2d}, [sp],#32
541	ushr	$temp.2d,@ACC[0].2d,#16
542	st1	{$N0.2d,$N1.2d}, [sp],#32
543	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
544	ins	$temp.d[1],$zero.d[0]
545
546	mov	$inner,$num
547	b	.LNEON_tail_entry
548
549.align	4
550.LNEON_tail:
551	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
552	mov	$Temp.16b,@ACC[0].16b
553	ushr	$temp.2d,@ACC[0].2d,#16
554	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
555	ld1	{@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
556	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
557	ld1	{@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
558	ushr	$temp.2d,@ACC[0].2d,#16
559	ld1	{@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
560	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
561	ins	$temp.d[1],$zero.d[0]
562
563.LNEON_tail_entry:
564___
565for ($i=1; $i<8; $i++) {
566$code.=<<___;
567	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
568	st1	{@ACC[0].s}[0], [$toutptr],#4
569	ushr	$temp.2d,@ACC[1].2d,#16
570	mov	$Temp.16b,@ACC[1].16b
571	ext	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
572	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
573	ushr	$temp.2d,@ACC[1].2d,#16
574	zip1	@ACC[1].4h,$Temp.4h,@ACC[1].4h
575	ins	$temp.d[1],$zero.d[0]
576___
577	push(@ACC,shift(@ACC));
578}
579	push(@ACC,shift(@ACC));
580$code.=<<___;
581	ld1	{@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
582	subs	$inner,$inner,#8
583	st1	{@ACC[7].s}[0], [$toutptr],#4
584	bne	.LNEON_tail
585
586	st1	{$temp.s}[0], [$toutptr],#4	// top-most bit
587	sub	$nptr,$nptr,$num,lsl#2		// rewind $nptr
588	subs	$aptr,sp,#0			// clear carry flag
589	add	$bptr,sp,$num,lsl#2
590
591.LNEON_sub:
592	ldp	w4,w5,[$aptr],#8
593	ldp	w6,w7,[$aptr],#8
594	ldp	w8,w9,[$nptr],#8
595	ldp	w10,w11,[$nptr],#8
596	sbcs	w8,w4,w8
597	sbcs	w9,w5,w9
598	sbcs	w10,w6,w10
599	sbcs	w11,w7,w11
600	sub	x17,$bptr,$aptr
601	stp	w8,w9,[$rptr],#8
602	stp	w10,w11,[$rptr],#8
603	cbnz	x17,.LNEON_sub
604
605	ldr	w10, [$aptr]		// load top-most bit
606	mov	x11,sp
607	eor	v0.16b,v0.16b,v0.16b
608	sub	x11,$bptr,x11		// this is num*4
609	eor	v1.16b,v1.16b,v1.16b
610	mov	$aptr,sp
611	sub	$rptr,$rptr,x11		// rewind $rptr
612	mov	$nptr,$bptr		// second 3/4th of frame
613	sbcs	w10,w10,wzr		// result is carry flag
614
615.LNEON_copy_n_zap:
616	ldp	w4,w5,[$aptr],#8
617	ldp	w6,w7,[$aptr],#8
618	ldp	w8,w9,[$rptr],#8
619	ldp	w10,w11,[$rptr]
620	sub	$rptr,$rptr,#8
621	b.cs	.LCopy_1
622	mov	w8,w4
623	mov	w9,w5
624	mov	w10,w6
625	mov	w11,w7
626.LCopy_1:
627	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
628	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
629	ldp	w4,w5,[$aptr],#8
630	ldp	w6,w7,[$aptr],#8
631	stp	w8,w9,[$rptr],#8
632	stp	w10,w11,[$rptr],#8
633	sub	$aptr,$aptr,#32
634	ldp	w8,w9,[$rptr],#8
635	ldp	w10,w11,[$rptr]
636	sub	$rptr,$rptr,#8
637	b.cs	.LCopy_2
638	mov	w8, w4
639	mov	w9, w5
640	mov	w10, w6
641	mov	w11, w7
642.LCopy_2:
643	st1	{v0.2d,v1.2d}, [$aptr],#32		// wipe
644	st1	{v0.2d,v1.2d}, [$nptr],#32		// wipe
645	sub	x17,$bptr,$aptr		// preserves carry
646	stp	w8,w9,[$rptr],#8
647	stp	w10,w11,[$rptr],#8
648	cbnz	x17,.LNEON_copy_n_zap
649
650	mov	sp,x16
651	ldp	d14,d15,[sp,#64]
652	ldp	d12,d13,[sp,#48]
653	ldp	d10,d11,[sp,#32]
654	ldp	d8,d9,[sp,#16]
655	ldr	x29,[sp],#80
656	AARCH64_VALIDATE_LINK_REGISTER
657	ret			// bx lr
658
659.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
660___
661}
662{
663########################################################################
664# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
665
666my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
667my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
668my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
669my ($cnt,$carry,$topmost)=("x27","x28","x30");
670my ($tp,$ap_end,$na0)=($bp,$np,$carry);
671
# Emit __bn_sqr8x_mont: squaring procedure for num divisible by 8.
# If ap != bp it is a plain multiplication and control falls through to
# __bn_mul4x_mont.  The heredoc below is emitted verbatim and is kept
# byte-identical; the carry chains are modulo-scheduled, so instruction
# order is load-bearing.
$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	 mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	 mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	 mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	 mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	 mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	 mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	 mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	 mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	 mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	 mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	 mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	 mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	 mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	 umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	 mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	 umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewinded ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldur	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
# From here on $ap/$ap_end walk the modulus, so alias them as np/np_end
# for readability in the reduction code that follows.
my ($np,$np_end)=($ap,$ap_end);
1090$code.=<<___;
1091	 ldp	$np,$n0,[x29,#104]	// pull np and n0
1092
1093	adcs	$acc2,$a2,$t1
1094	extr	$t2,$t3,$t2,#63
1095	adcs	$acc3,$a3,$t2
1096	ldp	$t1,$t2,[$tp,#8*5]
1097	mul	$a4,$a5,$a5
1098	umulh	$a5,$a5,$a5
1099	stp	$acc0,$acc1,[$tp,#8*0]
1100	mul	$a6,$a7,$a7
1101	umulh	$a7,$a7,$a7
1102	stp	$acc2,$acc3,[$tp,#8*2]
1103	extr	$t3,$t0,$t3,#63
1104	adcs	$acc4,$a4,$t3
1105	extr	$t0,$t1,$t0,#63
1106	 ldp	$acc0,$acc1,[sp,#8*0]
1107	adcs	$acc5,$a5,$t0
1108	extr	$t1,$t2,$t1,#63
1109	 ldp	$a0,$a1,[$np,#8*0]
1110	adcs	$acc6,$a6,$t1
1111	extr	$t2,xzr,$t2,#63
1112	 ldp	$a2,$a3,[$np,#8*2]
1113	adc	$acc7,$a7,$t2
1114	 ldp	$a4,$a5,[$np,#8*4]
1115
1116	// Reduce by 512 bits per iteration
1117	mul	$na0,$n0,$acc0		// t[0]*n0
1118	ldp	$a6,$a7,[$np,#8*6]
1119	add	$np_end,$np,$num
1120	ldp	$acc2,$acc3,[sp,#8*2]
1121	stp	$acc4,$acc5,[$tp,#8*4]
1122	ldp	$acc4,$acc5,[sp,#8*4]
1123	stp	$acc6,$acc7,[$tp,#8*6]
1124	ldp	$acc6,$acc7,[sp,#8*6]
1125	add	$np,$np,#8*8
1126	mov	$topmost,xzr		// initial top-most carry
1127	mov	$tp,sp
1128	mov	$cnt,#8
1129
1130.Lsqr8x_reduction:
1131	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
1132	mul	$t1,$a1,$na0
1133	sub	$cnt,$cnt,#1
1134	mul	$t2,$a2,$na0
1135	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
1136	mul	$t3,$a3,$na0
1137	// (*)	adds	xzr,$acc0,$t0
1138	subs	xzr,$acc0,#1		// (*)
1139	mul	$t0,$a4,$na0
1140	adcs	$acc0,$acc1,$t1
1141	mul	$t1,$a5,$na0
1142	adcs	$acc1,$acc2,$t2
1143	mul	$t2,$a6,$na0
1144	adcs	$acc2,$acc3,$t3
1145	mul	$t3,$a7,$na0
1146	adcs	$acc3,$acc4,$t0
1147	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
1148	adcs	$acc4,$acc5,$t1
1149	umulh	$t1,$a1,$na0
1150	adcs	$acc5,$acc6,$t2
1151	umulh	$t2,$a2,$na0
1152	adcs	$acc6,$acc7,$t3
1153	umulh	$t3,$a3,$na0
1154	adc	$acc7,xzr,xzr
1155	adds	$acc0,$acc0,$t0
1156	umulh	$t0,$a4,$na0
1157	adcs	$acc1,$acc1,$t1
1158	umulh	$t1,$a5,$na0
1159	adcs	$acc2,$acc2,$t2
1160	umulh	$t2,$a6,$na0
1161	adcs	$acc3,$acc3,$t3
1162	umulh	$t3,$a7,$na0
1163	mul	$na0,$n0,$acc0		// next t[0]*n0
1164	adcs	$acc4,$acc4,$t0
1165	adcs	$acc5,$acc5,$t1
1166	adcs	$acc6,$acc6,$t2
1167	adc	$acc7,$acc7,$t3
1168	cbnz	$cnt,.Lsqr8x_reduction
1169
1170	ldp	$t0,$t1,[$tp,#8*0]
1171	ldp	$t2,$t3,[$tp,#8*2]
1172	mov	$rp,$tp
1173	sub	$cnt,$np_end,$np	// done yet?
1174	adds	$acc0,$acc0,$t0
1175	adcs	$acc1,$acc1,$t1
1176	ldp	$t0,$t1,[$tp,#8*4]
1177	adcs	$acc2,$acc2,$t2
1178	adcs	$acc3,$acc3,$t3
1179	ldp	$t2,$t3,[$tp,#8*6]
1180	adcs	$acc4,$acc4,$t0
1181	adcs	$acc5,$acc5,$t1
1182	adcs	$acc6,$acc6,$t2
1183	adcs	$acc7,$acc7,$t3
1184	//adc	$carry,xzr,xzr		// moved below
1185	cbz	$cnt,.Lsqr8x8_post_condition
1186
1187	ldur	$n0,[$tp,#-8*8]
1188	ldp	$a0,$a1,[$np,#8*0]
1189	ldp	$a2,$a3,[$np,#8*2]
1190	ldp	$a4,$a5,[$np,#8*4]
1191	mov	$cnt,#-8*8
1192	ldp	$a6,$a7,[$np,#8*6]
1193	add	$np,$np,#8*8
1194
1195.Lsqr8x_tail:
1196	mul	$t0,$a0,$n0
1197	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
1198	mul	$t1,$a1,$n0
1199	add	$cnt,$cnt,#8
1200	mul	$t2,$a2,$n0
1201	mul	$t3,$a3,$n0
1202	adds	$acc0,$acc0,$t0
1203	mul	$t0,$a4,$n0
1204	adcs	$acc1,$acc1,$t1
1205	mul	$t1,$a5,$n0
1206	adcs	$acc2,$acc2,$t2
1207	mul	$t2,$a6,$n0
1208	adcs	$acc3,$acc3,$t3
1209	mul	$t3,$a7,$n0
1210	adcs	$acc4,$acc4,$t0
1211	umulh	$t0,$a0,$n0
1212	adcs	$acc5,$acc5,$t1
1213	umulh	$t1,$a1,$n0
1214	adcs	$acc6,$acc6,$t2
1215	umulh	$t2,$a2,$n0
1216	adcs	$acc7,$acc7,$t3
1217	umulh	$t3,$a3,$n0
1218	adc	$carry,$carry,xzr
1219	str	$acc0,[$tp],#8
1220	adds	$acc0,$acc1,$t0
1221	umulh	$t0,$a4,$n0
1222	adcs	$acc1,$acc2,$t1
1223	umulh	$t1,$a5,$n0
1224	adcs	$acc2,$acc3,$t2
1225	umulh	$t2,$a6,$n0
1226	adcs	$acc3,$acc4,$t3
1227	umulh	$t3,$a7,$n0
1228	ldr	$n0,[$rp,$cnt]
1229	adcs	$acc4,$acc5,$t0
1230	adcs	$acc5,$acc6,$t1
1231	adcs	$acc6,$acc7,$t2
1232	adcs	$acc7,$carry,$t3
1233	//adc	$carry,xzr,xzr		// moved above
1234	cbnz	$cnt,.Lsqr8x_tail
1235					// note that carry flag is guaranteed
1236					// to be zero at this point
1237	ldp	$a0,$a1,[$tp,#8*0]
1238	sub	$cnt,$np_end,$np	// done yet?
1239	sub	$t2,$np_end,$num	// rewinded np
1240	ldp	$a2,$a3,[$tp,#8*2]
1241	ldp	$a4,$a5,[$tp,#8*4]
1242	ldp	$a6,$a7,[$tp,#8*6]
1243	cbz	$cnt,.Lsqr8x_tail_break
1244
1245	ldur	$n0,[$rp,#-8*8]
1246	adds	$acc0,$acc0,$a0
1247	adcs	$acc1,$acc1,$a1
1248	ldp	$a0,$a1,[$np,#8*0]
1249	adcs	$acc2,$acc2,$a2
1250	adcs	$acc3,$acc3,$a3
1251	ldp	$a2,$a3,[$np,#8*2]
1252	adcs	$acc4,$acc4,$a4
1253	adcs	$acc5,$acc5,$a5
1254	ldp	$a4,$a5,[$np,#8*4]
1255	adcs	$acc6,$acc6,$a6
1256	mov	$cnt,#-8*8
1257	adcs	$acc7,$acc7,$a7
1258	ldp	$a6,$a7,[$np,#8*6]
1259	add	$np,$np,#8*8
1260	//adc	$carry,xzr,xzr		// moved above
1261	b	.Lsqr8x_tail
1262
1263.align	4
1264.Lsqr8x_tail_break:
1265	ldr	$n0,[x29,#112]		// pull n0
1266	add	$cnt,$tp,#8*8		// end of current t[num] window
1267
1268	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
1269	adcs	$t0,$acc0,$a0
1270	adcs	$t1,$acc1,$a1
1271	ldp	$acc0,$acc1,[$rp,#8*0]
1272	adcs	$acc2,$acc2,$a2
1273	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
1274	adcs	$acc3,$acc3,$a3
1275	ldp	$a2,$a3,[$t2,#8*2]
1276	adcs	$acc4,$acc4,$a4
1277	adcs	$acc5,$acc5,$a5
1278	ldp	$a4,$a5,[$t2,#8*4]
1279	adcs	$acc6,$acc6,$a6
1280	adcs	$acc7,$acc7,$a7
1281	ldp	$a6,$a7,[$t2,#8*6]
1282	add	$np,$t2,#8*8
1283	adc	$topmost,xzr,xzr	// top-most carry
1284	mul	$na0,$n0,$acc0
1285	stp	$t0,$t1,[$tp,#8*0]
1286	stp	$acc2,$acc3,[$tp,#8*2]
1287	ldp	$acc2,$acc3,[$rp,#8*2]
1288	stp	$acc4,$acc5,[$tp,#8*4]
1289	ldp	$acc4,$acc5,[$rp,#8*4]
1290	cmp	$cnt,x29		// did we hit the bottom?
1291	stp	$acc6,$acc7,[$tp,#8*6]
1292	mov	$tp,$rp			// slide the window
1293	ldp	$acc6,$acc7,[$rp,#8*6]
1294	mov	$cnt,#8
1295	b.ne	.Lsqr8x_reduction
1296
1297	// Final step. We see if result is larger than modulus, and
1298	// if it is, subtract the modulus. But comparison implies
1299	// subtraction. So we subtract modulus, see if it borrowed,
1300	// and conditionally copy original value.
1301	ldr	$rp,[x29,#96]		// pull rp
1302	add	$tp,$tp,#8*8
1303	subs	$t0,$acc0,$a0
1304	sbcs	$t1,$acc1,$a1
1305	sub	$cnt,$num,#8*8
1306	mov	$ap_end,$rp		// $rp copy
1307
1308.Lsqr8x_sub:
1309	sbcs	$t2,$acc2,$a2
1310	ldp	$a0,$a1,[$np,#8*0]
1311	sbcs	$t3,$acc3,$a3
1312	stp	$t0,$t1,[$rp,#8*0]
1313	sbcs	$t0,$acc4,$a4
1314	ldp	$a2,$a3,[$np,#8*2]
1315	sbcs	$t1,$acc5,$a5
1316	stp	$t2,$t3,[$rp,#8*2]
1317	sbcs	$t2,$acc6,$a6
1318	ldp	$a4,$a5,[$np,#8*4]
1319	sbcs	$t3,$acc7,$a7
1320	ldp	$a6,$a7,[$np,#8*6]
1321	add	$np,$np,#8*8
1322	ldp	$acc0,$acc1,[$tp,#8*0]
1323	sub	$cnt,$cnt,#8*8
1324	ldp	$acc2,$acc3,[$tp,#8*2]
1325	ldp	$acc4,$acc5,[$tp,#8*4]
1326	ldp	$acc6,$acc7,[$tp,#8*6]
1327	add	$tp,$tp,#8*8
1328	stp	$t0,$t1,[$rp,#8*4]
1329	sbcs	$t0,$acc0,$a0
1330	stp	$t2,$t3,[$rp,#8*6]
1331	add	$rp,$rp,#8*8
1332	sbcs	$t1,$acc1,$a1
1333	cbnz	$cnt,.Lsqr8x_sub
1334
1335	sbcs	$t2,$acc2,$a2
1336	 mov	$tp,sp
1337	 add	$ap,sp,$num
1338	 ldp	$a0,$a1,[$ap_end,#8*0]
1339	sbcs	$t3,$acc3,$a3
1340	stp	$t0,$t1,[$rp,#8*0]
1341	sbcs	$t0,$acc4,$a4
1342	 ldp	$a2,$a3,[$ap_end,#8*2]
1343	sbcs	$t1,$acc5,$a5
1344	stp	$t2,$t3,[$rp,#8*2]
1345	sbcs	$t2,$acc6,$a6
1346	 ldp	$acc0,$acc1,[$ap,#8*0]
1347	sbcs	$t3,$acc7,$a7
1348	 ldp	$acc2,$acc3,[$ap,#8*2]
1349	sbcs	xzr,$topmost,xzr	// did it borrow?
1350	ldr	x30,[x29,#8]		// pull return address
1351	stp	$t0,$t1,[$rp,#8*4]
1352	stp	$t2,$t3,[$rp,#8*6]
1353
1354	sub	$cnt,$num,#8*4
1355.Lsqr4x_cond_copy:
1356	sub	$cnt,$cnt,#8*4
1357	csel	$t0,$acc0,$a0,lo
1358	 stp	xzr,xzr,[$tp,#8*0]
1359	csel	$t1,$acc1,$a1,lo
1360	ldp	$a0,$a1,[$ap_end,#8*4]
1361	ldp	$acc0,$acc1,[$ap,#8*4]
1362	csel	$t2,$acc2,$a2,lo
1363	 stp	xzr,xzr,[$tp,#8*2]
1364	 add	$tp,$tp,#8*4
1365	csel	$t3,$acc3,$a3,lo
1366	ldp	$a2,$a3,[$ap_end,#8*6]
1367	ldp	$acc2,$acc3,[$ap,#8*6]
1368	add	$ap,$ap,#8*4
1369	stp	$t0,$t1,[$ap_end,#8*0]
1370	stp	$t2,$t3,[$ap_end,#8*2]
1371	add	$ap_end,$ap_end,#8*4
1372	 stp	xzr,xzr,[$ap,#8*0]
1373	 stp	xzr,xzr,[$ap,#8*2]
1374	cbnz	$cnt,.Lsqr4x_cond_copy
1375
1376	csel	$t0,$acc0,$a0,lo
1377	 stp	xzr,xzr,[$tp,#8*0]
1378	csel	$t1,$acc1,$a1,lo
1379	 stp	xzr,xzr,[$tp,#8*2]
1380	csel	$t2,$acc2,$a2,lo
1381	csel	$t3,$acc3,$a3,lo
1382	stp	$t0,$t1,[$ap_end,#8*0]
1383	stp	$t2,$t3,[$ap_end,#8*2]
1384
1385	b	.Lsqr8x_done
1386
1387.align	4
1388.Lsqr8x8_post_condition:
1389	adc	$carry,xzr,xzr
1390	ldr	x30,[x29,#8]		// pull return address
1391	// $acc0-7,$carry hold result, $a0-7 hold modulus
1392	subs	$a0,$acc0,$a0
1393	ldr	$ap,[x29,#96]		// pull rp
1394	sbcs	$a1,$acc1,$a1
1395	 stp	xzr,xzr,[sp,#8*0]
1396	sbcs	$a2,$acc2,$a2
1397	 stp	xzr,xzr,[sp,#8*2]
1398	sbcs	$a3,$acc3,$a3
1399	 stp	xzr,xzr,[sp,#8*4]
1400	sbcs	$a4,$acc4,$a4
1401	 stp	xzr,xzr,[sp,#8*6]
1402	sbcs	$a5,$acc5,$a5
1403	 stp	xzr,xzr,[sp,#8*8]
1404	sbcs	$a6,$acc6,$a6
1405	 stp	xzr,xzr,[sp,#8*10]
1406	sbcs	$a7,$acc7,$a7
1407	 stp	xzr,xzr,[sp,#8*12]
1408	sbcs	$carry,$carry,xzr	// did it borrow?
1409	 stp	xzr,xzr,[sp,#8*14]
1410
1411	// $a0-7 hold result-modulus
1412	csel	$a0,$acc0,$a0,lo
1413	csel	$a1,$acc1,$a1,lo
1414	csel	$a2,$acc2,$a2,lo
1415	csel	$a3,$acc3,$a3,lo
1416	stp	$a0,$a1,[$ap,#8*0]
1417	csel	$a4,$acc4,$a4,lo
1418	csel	$a5,$acc5,$a5,lo
1419	stp	$a2,$a3,[$ap,#8*2]
1420	csel	$a6,$acc6,$a6,lo
1421	csel	$a7,$acc7,$a7,lo
1422	stp	$a4,$a5,[$ap,#8*4]
1423	stp	$a6,$a7,[$ap,#8*6]
1424
1425.Lsqr8x_done:
1426	ldp	x19,x20,[x29,#16]
1427	mov	sp,x29
1428	ldp	x21,x22,[x29,#32]
1429	mov	x0,#1
1430	ldp	x23,x24,[x29,#48]
1431	ldp	x25,x26,[x29,#64]
1432	ldp	x27,x28,[x29,#80]
1433	ldr	x29,[sp],#128
1434	// x30 is loaded earlier
1435	AARCH64_VALIDATE_LINK_REGISTER
1436	ret
1437.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
1438___
1439}
1440
1441{
1442########################################################################
1443# Even though this might look as ARMv8 adaptation of mulx4x_mont from
1444# x86_64-mont5 module, it's different in sense that it performs
1445# reduction 256 bits at a time.
1446
1447my ($a0,$a1,$a2,$a3,
1448    $t0,$t1,$t2,$t3,
1449    $m0,$m1,$m2,$m3,
1450    $acc0,$acc1,$acc2,$acc3,$acc4,
1451    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1452my  $bp_end=$rp;
1453my  ($carry,$topmost) = ($rp,"x30");
1454
1455$code.=<<___;
1456.type	__bn_mul4x_mont,%function
1457.align	5
1458__bn_mul4x_mont:
1459	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
1460	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
1461	stp	x29,x30,[sp,#-128]!
1462	add	x29,sp,#0
1463	stp	x19,x20,[sp,#16]
1464	stp	x21,x22,[sp,#32]
1465	stp	x23,x24,[sp,#48]
1466	stp	x25,x26,[sp,#64]
1467	stp	x27,x28,[sp,#80]
1468
1469	sub	$tp,sp,$num,lsl#3
1470	lsl	$num,$num,#3
1471	ldr	$n0,[$n0]		// *n0
1472	sub	sp,$tp,#8*4		// alloca
1473
1474	add	$t0,$bp,$num
1475	add	$ap_end,$ap,$num
1476	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]
1477
1478	ldr	$bi,[$bp,#8*0]		// b[0]
1479	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
1480	ldp	$a2,$a3,[$ap,#8*2]
1481	add	$ap,$ap,#8*4
1482	mov	$acc0,xzr
1483	mov	$acc1,xzr
1484	mov	$acc2,xzr
1485	mov	$acc3,xzr
1486	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
1487	ldp	$m2,$m3,[$np,#8*2]
1488	adds	$np,$np,#8*4		// clear carry bit
1489	mov	$carry,xzr
1490	mov	$cnt,#0
1491	mov	$tp,sp
1492
1493.Loop_mul4x_1st_reduction:
1494	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
1495	adc	$carry,$carry,xzr	// modulo-scheduled
1496	mul	$t1,$a1,$bi
1497	add	$cnt,$cnt,#8
1498	mul	$t2,$a2,$bi
1499	and	$cnt,$cnt,#31
1500	mul	$t3,$a3,$bi
1501	adds	$acc0,$acc0,$t0
1502	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
1503	adcs	$acc1,$acc1,$t1
1504	mul	$mi,$acc0,$n0		// t[0]*n0
1505	adcs	$acc2,$acc2,$t2
1506	umulh	$t1,$a1,$bi
1507	adcs	$acc3,$acc3,$t3
1508	umulh	$t2,$a2,$bi
1509	adc	$acc4,xzr,xzr
1510	umulh	$t3,$a3,$bi
1511	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
1512	adds	$acc1,$acc1,$t0
1513	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
1514	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
1515	adcs	$acc2,$acc2,$t1
1516	mul	$t1,$m1,$mi
1517	adcs	$acc3,$acc3,$t2
1518	mul	$t2,$m2,$mi
1519	adc	$acc4,$acc4,$t3		// can't overflow
1520	mul	$t3,$m3,$mi
1521	// (*)	adds	xzr,$acc0,$t0
1522	subs	xzr,$acc0,#1		// (*)
1523	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
1524	adcs	$acc0,$acc1,$t1
1525	umulh	$t1,$m1,$mi
1526	adcs	$acc1,$acc2,$t2
1527	umulh	$t2,$m2,$mi
1528	adcs	$acc2,$acc3,$t3
1529	umulh	$t3,$m3,$mi
1530	adcs	$acc3,$acc4,$carry
1531	adc	$carry,xzr,xzr
1532	adds	$acc0,$acc0,$t0
1533	sub	$t0,$ap_end,$ap
1534	adcs	$acc1,$acc1,$t1
1535	adcs	$acc2,$acc2,$t2
1536	adcs	$acc3,$acc3,$t3
1537	//adc	$carry,$carry,xzr
1538	cbnz	$cnt,.Loop_mul4x_1st_reduction
1539
1540	cbz	$t0,.Lmul4x4_post_condition
1541
1542	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
1543	ldp	$a2,$a3,[$ap,#8*2]
1544	add	$ap,$ap,#8*4
1545	ldr	$mi,[sp]		// a[0]*n0
1546	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
1547	ldp	$m2,$m3,[$np,#8*2]
1548	add	$np,$np,#8*4
1549
1550.Loop_mul4x_1st_tail:
1551	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
1552	adc	$carry,$carry,xzr	// modulo-scheduled
1553	mul	$t1,$a1,$bi
1554	add	$cnt,$cnt,#8
1555	mul	$t2,$a2,$bi
1556	and	$cnt,$cnt,#31
1557	mul	$t3,$a3,$bi
1558	adds	$acc0,$acc0,$t0
1559	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
1560	adcs	$acc1,$acc1,$t1
1561	umulh	$t1,$a1,$bi
1562	adcs	$acc2,$acc2,$t2
1563	umulh	$t2,$a2,$bi
1564	adcs	$acc3,$acc3,$t3
1565	umulh	$t3,$a3,$bi
1566	adc	$acc4,xzr,xzr
1567	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
1568	adds	$acc1,$acc1,$t0
1569	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
1570	adcs	$acc2,$acc2,$t1
1571	mul	$t1,$m1,$mi
1572	adcs	$acc3,$acc3,$t2
1573	mul	$t2,$m2,$mi
1574	adc	$acc4,$acc4,$t3		// can't overflow
1575	mul	$t3,$m3,$mi
1576	adds	$acc0,$acc0,$t0
1577	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
1578	adcs	$acc1,$acc1,$t1
1579	umulh	$t1,$m1,$mi
1580	adcs	$acc2,$acc2,$t2
1581	umulh	$t2,$m2,$mi
1582	adcs	$acc3,$acc3,$t3
1583	adcs	$acc4,$acc4,$carry
1584	umulh	$t3,$m3,$mi
1585	adc	$carry,xzr,xzr
1586	ldr	$mi,[sp,$cnt]		// next t[0]*n0
1587	str	$acc0,[$tp],#8		// result!!!
1588	adds	$acc0,$acc1,$t0
1589	sub	$t0,$ap_end,$ap		// done yet?
1590	adcs	$acc1,$acc2,$t1
1591	adcs	$acc2,$acc3,$t2
1592	adcs	$acc3,$acc4,$t3
1593	//adc	$carry,$carry,xzr
1594	cbnz	$cnt,.Loop_mul4x_1st_tail
1595
1596	sub	$t1,$ap_end,$num	// rewinded $ap
1597	cbz	$t0,.Lmul4x_proceed
1598
1599	ldp	$a0,$a1,[$ap,#8*0]
1600	ldp	$a2,$a3,[$ap,#8*2]
1601	add	$ap,$ap,#8*4
1602	ldp	$m0,$m1,[$np,#8*0]
1603	ldp	$m2,$m3,[$np,#8*2]
1604	add	$np,$np,#8*4
1605	b	.Loop_mul4x_1st_tail
1606
1607.align	5
1608.Lmul4x_proceed:
1609	ldr	$bi,[$bp,#8*4]!		// *++b
1610	adc	$topmost,$carry,xzr
1611	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
1612	sub	$np,$np,$num		// rewind np
1613	ldp	$a2,$a3,[$t1,#8*2]
1614	add	$ap,$t1,#8*4
1615
1616	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
1617	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
1618	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
1619	ldp	$acc2,$acc3,[sp,#8*6]
1620
1621	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
1622	mov	$tp,sp
1623	ldp	$m2,$m3,[$np,#8*2]
1624	adds	$np,$np,#8*4		// clear carry bit
1625	mov	$carry,xzr
1626
1627.align	4
1628.Loop_mul4x_reduction:
1629	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
1630	adc	$carry,$carry,xzr	// modulo-scheduled
1631	mul	$t1,$a1,$bi
1632	add	$cnt,$cnt,#8
1633	mul	$t2,$a2,$bi
1634	and	$cnt,$cnt,#31
1635	mul	$t3,$a3,$bi
1636	adds	$acc0,$acc0,$t0
1637	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
1638	adcs	$acc1,$acc1,$t1
1639	mul	$mi,$acc0,$n0		// t[0]*n0
1640	adcs	$acc2,$acc2,$t2
1641	umulh	$t1,$a1,$bi
1642	adcs	$acc3,$acc3,$t3
1643	umulh	$t2,$a2,$bi
1644	adc	$acc4,xzr,xzr
1645	umulh	$t3,$a3,$bi
1646	ldr	$bi,[$bp,$cnt]		// next b[i]
1647	adds	$acc1,$acc1,$t0
1648	// (*)	mul	$t0,$m0,$mi
1649	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
1650	adcs	$acc2,$acc2,$t1
1651	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0
1652	adcs	$acc3,$acc3,$t2
1653	mul	$t2,$m2,$mi
1654	adc	$acc4,$acc4,$t3		// can't overflow
1655	mul	$t3,$m3,$mi
1656	// (*)	adds	xzr,$acc0,$t0
1657	subs	xzr,$acc0,#1		// (*)
1658	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0
1659	adcs	$acc0,$acc1,$t1
1660	umulh	$t1,$m1,$mi
1661	adcs	$acc1,$acc2,$t2
1662	umulh	$t2,$m2,$mi
1663	adcs	$acc2,$acc3,$t3
1664	umulh	$t3,$m3,$mi
1665	adcs	$acc3,$acc4,$carry
1666	adc	$carry,xzr,xzr
1667	adds	$acc0,$acc0,$t0
1668	adcs	$acc1,$acc1,$t1
1669	adcs	$acc2,$acc2,$t2
1670	adcs	$acc3,$acc3,$t3
1671	//adc	$carry,$carry,xzr
1672	cbnz	$cnt,.Loop_mul4x_reduction
1673
1674	adc	$carry,$carry,xzr
1675	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
1676	ldp	$t2,$t3,[$tp,#8*6]
1677	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
1678	ldp	$a2,$a3,[$ap,#8*2]
1679	add	$ap,$ap,#8*4
1680	adds	$acc0,$acc0,$t0
1681	adcs	$acc1,$acc1,$t1
1682	adcs	$acc2,$acc2,$t2
1683	adcs	$acc3,$acc3,$t3
1684	//adc	$carry,$carry,xzr
1685
1686	ldr	$mi,[sp]		// t[0]*n0
1687	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
1688	ldp	$m2,$m3,[$np,#8*2]
1689	add	$np,$np,#8*4
1690
1691.align	4
1692.Loop_mul4x_tail:
1693	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
1694	adc	$carry,$carry,xzr	// modulo-scheduled
1695	mul	$t1,$a1,$bi
1696	add	$cnt,$cnt,#8
1697	mul	$t2,$a2,$bi
1698	and	$cnt,$cnt,#31
1699	mul	$t3,$a3,$bi
1700	adds	$acc0,$acc0,$t0
1701	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
1702	adcs	$acc1,$acc1,$t1
1703	umulh	$t1,$a1,$bi
1704	adcs	$acc2,$acc2,$t2
1705	umulh	$t2,$a2,$bi
1706	adcs	$acc3,$acc3,$t3
1707	umulh	$t3,$a3,$bi
1708	adc	$acc4,xzr,xzr
1709	ldr	$bi,[$bp,$cnt]		// next b[i]
1710	adds	$acc1,$acc1,$t0
1711	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
1712	adcs	$acc2,$acc2,$t1
1713	mul	$t1,$m1,$mi
1714	adcs	$acc3,$acc3,$t2
1715	mul	$t2,$m2,$mi
1716	adc	$acc4,$acc4,$t3		// can't overflow
1717	mul	$t3,$m3,$mi
1718	adds	$acc0,$acc0,$t0
1719	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
1720	adcs	$acc1,$acc1,$t1
1721	umulh	$t1,$m1,$mi
1722	adcs	$acc2,$acc2,$t2
1723	umulh	$t2,$m2,$mi
1724	adcs	$acc3,$acc3,$t3
1725	umulh	$t3,$m3,$mi
1726	adcs	$acc4,$acc4,$carry
1727	ldr	$mi,[sp,$cnt]		// next a[0]*n0
1728	adc	$carry,xzr,xzr
1729	str	$acc0,[$tp],#8		// result!!!
1730	adds	$acc0,$acc1,$t0
1731	sub	$t0,$ap_end,$ap		// done yet?
1732	adcs	$acc1,$acc2,$t1
1733	adcs	$acc2,$acc3,$t2
1734	adcs	$acc3,$acc4,$t3
1735	//adc	$carry,$carry,xzr
1736	cbnz	$cnt,.Loop_mul4x_tail
1737
1738	sub	$t1,$np,$num		// rewinded np?
1739	adc	$carry,$carry,xzr
1740	cbz	$t0,.Loop_mul4x_break
1741
1742	ldp	$t0,$t1,[$tp,#8*4]
1743	ldp	$t2,$t3,[$tp,#8*6]
1744	ldp	$a0,$a1,[$ap,#8*0]
1745	ldp	$a2,$a3,[$ap,#8*2]
1746	add	$ap,$ap,#8*4
1747	adds	$acc0,$acc0,$t0
1748	adcs	$acc1,$acc1,$t1
1749	adcs	$acc2,$acc2,$t2
1750	adcs	$acc3,$acc3,$t3
1751	//adc	$carry,$carry,xzr
1752	ldp	$m0,$m1,[$np,#8*0]
1753	ldp	$m2,$m3,[$np,#8*2]
1754	add	$np,$np,#8*4
1755	b	.Loop_mul4x_tail
1756
1757.align	4
1758.Loop_mul4x_break:
1759	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
1760	adds	$acc0,$acc0,$topmost
1761	add	$bp,$bp,#8*4		// bp++
1762	adcs	$acc1,$acc1,xzr
1763	sub	$ap,$ap,$num		// rewind ap
1764	adcs	$acc2,$acc2,xzr
1765	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
1766	adcs	$acc3,$acc3,xzr
1767	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
1768	adc	$topmost,$carry,xzr
1769	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
1770	cmp	$bp,$t3			// done yet?
1771	ldp	$acc2,$acc3,[sp,#8*6]
1772	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
1773	ldp	$m2,$m3,[$t1,#8*2]
1774	add	$np,$t1,#8*4
1775	b.eq	.Lmul4x_post
1776
1777	ldr	$bi,[$bp]
1778	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
1779	ldp	$a2,$a3,[$ap,#8*2]
1780	adds	$ap,$ap,#8*4		// clear carry bit
1781	mov	$carry,xzr
1782	mov	$tp,sp
1783	b	.Loop_mul4x_reduction
1784
1785.align	4
1786.Lmul4x_post:
1787	// Final step. We see if result is larger than modulus, and
1788	// if it is, subtract the modulus. But comparison implies
1789	// subtraction. So we subtract modulus, see if it borrowed,
1790	// and conditionally copy original value.
1791	mov	$rp,$t2
1792	mov	$ap_end,$t2		// $rp copy
1793	subs	$t0,$acc0,$m0
1794	add	$tp,sp,#8*8
1795	sbcs	$t1,$acc1,$m1
1796	sub	$cnt,$num,#8*4
1797
1798.Lmul4x_sub:
1799	sbcs	$t2,$acc2,$m2
1800	ldp	$m0,$m1,[$np,#8*0]
1801	sub	$cnt,$cnt,#8*4
1802	ldp	$acc0,$acc1,[$tp,#8*0]
1803	sbcs	$t3,$acc3,$m3
1804	ldp	$m2,$m3,[$np,#8*2]
1805	add	$np,$np,#8*4
1806	ldp	$acc2,$acc3,[$tp,#8*2]
1807	add	$tp,$tp,#8*4
1808	stp	$t0,$t1,[$rp,#8*0]
1809	sbcs	$t0,$acc0,$m0
1810	stp	$t2,$t3,[$rp,#8*2]
1811	add	$rp,$rp,#8*4
1812	sbcs	$t1,$acc1,$m1
1813	cbnz	$cnt,.Lmul4x_sub
1814
1815	sbcs	$t2,$acc2,$m2
1816	 mov	$tp,sp
1817	 add	$ap,sp,#8*4
1818	 ldp	$a0,$a1,[$ap_end,#8*0]
1819	sbcs	$t3,$acc3,$m3
1820	stp	$t0,$t1,[$rp,#8*0]
1821	 ldp	$a2,$a3,[$ap_end,#8*2]
1822	stp	$t2,$t3,[$rp,#8*2]
1823	 ldp	$acc0,$acc1,[$ap,#8*0]
1824	 ldp	$acc2,$acc3,[$ap,#8*2]
1825	sbcs	xzr,$topmost,xzr	// did it borrow?
1826	ldr	x30,[x29,#8]		// pull return address
1827
1828	sub	$cnt,$num,#8*4
1829.Lmul4x_cond_copy:
1830	sub	$cnt,$cnt,#8*4
1831	csel	$t0,$acc0,$a0,lo
1832	 stp	xzr,xzr,[$tp,#8*0]
1833	csel	$t1,$acc1,$a1,lo
1834	ldp	$a0,$a1,[$ap_end,#8*4]
1835	ldp	$acc0,$acc1,[$ap,#8*4]
1836	csel	$t2,$acc2,$a2,lo
1837	 stp	xzr,xzr,[$tp,#8*2]
1838	 add	$tp,$tp,#8*4
1839	csel	$t3,$acc3,$a3,lo
1840	ldp	$a2,$a3,[$ap_end,#8*6]
1841	ldp	$acc2,$acc3,[$ap,#8*6]
1842	add	$ap,$ap,#8*4
1843	stp	$t0,$t1,[$ap_end,#8*0]
1844	stp	$t2,$t3,[$ap_end,#8*2]
1845	add	$ap_end,$ap_end,#8*4
1846	cbnz	$cnt,.Lmul4x_cond_copy
1847
1848	csel	$t0,$acc0,$a0,lo
1849	 stp	xzr,xzr,[$tp,#8*0]
1850	csel	$t1,$acc1,$a1,lo
1851	 stp	xzr,xzr,[$tp,#8*2]
1852	csel	$t2,$acc2,$a2,lo
1853	 stp	xzr,xzr,[$tp,#8*3]
1854	csel	$t3,$acc3,$a3,lo
1855	 stp	xzr,xzr,[$tp,#8*4]
1856	stp	$t0,$t1,[$ap_end,#8*0]
1857	stp	$t2,$t3,[$ap_end,#8*2]
1858
1859	b	.Lmul4x_done
1860
1861.align	4
1862.Lmul4x4_post_condition:
1863	adc	$carry,$carry,xzr
1864	ldr	$ap,[x29,#96]		// pull rp
1865	// $acc0-3,$carry hold result, $m0-7 hold modulus
1866	subs	$a0,$acc0,$m0
1867	ldr	x30,[x29,#8]		// pull return address
1868	sbcs	$a1,$acc1,$m1
1869	 stp	xzr,xzr,[sp,#8*0]
1870	sbcs	$a2,$acc2,$m2
1871	 stp	xzr,xzr,[sp,#8*2]
1872	sbcs	$a3,$acc3,$m3
1873	 stp	xzr,xzr,[sp,#8*4]
1874	sbcs	xzr,$carry,xzr		// did it borrow?
1875	 stp	xzr,xzr,[sp,#8*6]
1876
1877	// $a0-3 hold result-modulus
1878	csel	$a0,$acc0,$a0,lo
1879	csel	$a1,$acc1,$a1,lo
1880	csel	$a2,$acc2,$a2,lo
1881	csel	$a3,$acc3,$a3,lo
1882	stp	$a0,$a1,[$ap,#8*0]
1883	stp	$a2,$a3,[$ap,#8*2]
1884
1885.Lmul4x_done:
1886	ldp	x19,x20,[x29,#16]
1887	mov	sp,x29
1888	ldp	x21,x22,[x29,#32]
1889	mov	x0,#1
1890	ldp	x23,x24,[x29,#48]
1891	ldp	x25,x26,[x29,#64]
1892	ldp	x27,x28,[x29,#80]
1893	ldr	x29,[sp],#128
1894	// x30 loaded earlier
1895	AARCH64_VALIDATE_LINK_REGISTER
1896	ret
1897.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1898___
1899}
# Append the identification string and trailing alignment directive, then
# flush the accumulated assembly to stdout and make sure the write landed.
my $trailer = <<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
$code .= $trailer;

print $code;

close STDOUT or die "error closing STDOUT: $!";
1908