1#! /usr/bin/env perl
2# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# March 2015
18#
19# "Teaser" Montgomery multiplication module for ARMv8. Needs more
20# work. While it does improve RSA sign performance by 20-30% (less for
21# longer keys) on most processors, for some reason RSA2048 is not
22# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
23# instruction issue rate is limited on processor in question, meaning
24# that dedicated squaring procedure is a must. Well, actually all
25# contemporary AArch64 processors seem to have limited multiplication
26# issue rate, i.e. they can't issue multiplication every cycle, which
27# explains moderate improvement coefficients in comparison to
28# compiler-generated code. Recall that compiler is instructed to use
29# umulh and therefore uses same amount of multiplication instructions
30# to do the job. Assembly's edge is to minimize number of "collateral"
31# instructions and of course instruction scheduling.
32#
33# April 2015
34#
35# Squaring procedure that handles lengths divisible by 8 improves
36# RSA/DSA performance by 25-40-60% depending on processor and key
37# length. Overall improvement coefficients are always positive in
38# comparison to compiler-generated code. On Cortex-A57 improvement
39# is still modest on longest key lengths, while others exhibit e.g.
40# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
41# on Cortex-A57 and ~60-100% faster on others.
42
43$flavour = shift;
44$output  = shift;
45
46$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
48( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
49die "can't locate arm-xlate.pl";
50
51open OUT,"| \"$^X\" $xlate $flavour $output";
52*STDOUT=*OUT;
53
54($lo0,$hi0,$aj,$m0,$alo,$ahi,
55 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
56 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
57
58# int bn_mul_mont(
59$rp="x0";	# BN_ULONG *rp,
60$ap="x1";	# const BN_ULONG *ap,
61$bp="x2";	# const BN_ULONG *bp,
62$np="x3";	# const BN_ULONG *np,
63$n0="x4";	# const BN_ULONG *n0,
64$num="x5";	# int num);
65
66$code.=<<___;
67.text
68
69.globl	bn_mul_mont
70.type	bn_mul_mont,%function
71.align	5
72bn_mul_mont:
73	tst	$num,#7
74	b.eq	__bn_sqr8x_mont
75	tst	$num,#3
76	b.eq	__bn_mul4x_mont
77.Lmul_mont:
78	stp	x29,x30,[sp,#-64]!
79	add	x29,sp,#0
80	stp	x19,x20,[sp,#16]
81	stp	x21,x22,[sp,#32]
82	stp	x23,x24,[sp,#48]
83
84	ldr	$m0,[$bp],#8		// bp[0]
85	sub	$tp,sp,$num,lsl#3
86	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
87	lsl	$num,$num,#3
88	ldr	$n0,[$n0]		// *n0
89	and	$tp,$tp,#-16		// ABI says so
90	ldp	$hi1,$nj,[$np],#16	// np[0..1]
91
92	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
93	sub	$j,$num,#16		// j=num-2
94	umulh	$hi0,$hi0,$m0
95	mul	$alo,$aj,$m0		// ap[1]*bp[0]
96	umulh	$ahi,$aj,$m0
97
98	mul	$m1,$lo0,$n0		// "tp[0]"*n0
99	mov	sp,$tp			// alloca
100
101	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
102	umulh	$hi1,$hi1,$m1
103	mul	$nlo,$nj,$m1		// np[1]*m1
104	// (*)	adds	$lo1,$lo1,$lo0	// discarded
105	// (*)	As for removal of first multiplication and addition
106	//	instructions. The outcome of first addition is
107	//	guaranteed to be zero, which leaves two computationally
108	//	significant outcomes: it either carries or not. Then
109	//	question is when does it carry? Is there alternative
110	//	way to deduce it? If you follow operations, you can
111	//	observe that condition for carry is quite simple:
112	//	$lo0 being non-zero. So that carry can be calculated
113	//	by adding -1 to $lo0. That's what next instruction does.
114	subs	xzr,$lo0,#1		// (*)
115	umulh	$nhi,$nj,$m1
116	adc	$hi1,$hi1,xzr
117	cbz	$j,.L1st_skip
118
119.L1st:
120	ldr	$aj,[$ap],#8
121	adds	$lo0,$alo,$hi0
122	sub	$j,$j,#8		// j--
123	adc	$hi0,$ahi,xzr
124
125	ldr	$nj,[$np],#8
126	adds	$lo1,$nlo,$hi1
127	mul	$alo,$aj,$m0		// ap[j]*bp[0]
128	adc	$hi1,$nhi,xzr
129	umulh	$ahi,$aj,$m0
130
131	adds	$lo1,$lo1,$lo0
132	mul	$nlo,$nj,$m1		// np[j]*m1
133	adc	$hi1,$hi1,xzr
134	umulh	$nhi,$nj,$m1
135	str	$lo1,[$tp],#8		// tp[j-1]
136	cbnz	$j,.L1st
137
138.L1st_skip:
139	adds	$lo0,$alo,$hi0
140	sub	$ap,$ap,$num		// rewind $ap
141	adc	$hi0,$ahi,xzr
142
143	adds	$lo1,$nlo,$hi1
144	sub	$np,$np,$num		// rewind $np
145	adc	$hi1,$nhi,xzr
146
147	adds	$lo1,$lo1,$lo0
148	sub	$i,$num,#8		// i=num-1
149	adcs	$hi1,$hi1,$hi0
150
151	adc	$ovf,xzr,xzr		// upmost overflow bit
152	stp	$lo1,$hi1,[$tp]
153
154.Louter:
155	ldr	$m0,[$bp],#8		// bp[i]
156	ldp	$hi0,$aj,[$ap],#16
157	ldr	$tj,[sp]		// tp[0]
158	add	$tp,sp,#8
159
160	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
161	sub	$j,$num,#16		// j=num-2
162	umulh	$hi0,$hi0,$m0
163	ldp	$hi1,$nj,[$np],#16
164	mul	$alo,$aj,$m0		// ap[1]*bp[i]
165	adds	$lo0,$lo0,$tj
166	umulh	$ahi,$aj,$m0
167	adc	$hi0,$hi0,xzr
168
169	mul	$m1,$lo0,$n0
170	sub	$i,$i,#8		// i--
171
172	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
173	umulh	$hi1,$hi1,$m1
174	mul	$nlo,$nj,$m1		// np[1]*m1
175	// (*)	adds	$lo1,$lo1,$lo0
176	subs	xzr,$lo0,#1		// (*)
177	umulh	$nhi,$nj,$m1
178	cbz	$j,.Linner_skip
179
180.Linner:
181	ldr	$aj,[$ap],#8
182	adc	$hi1,$hi1,xzr
183	ldr	$tj,[$tp],#8		// tp[j]
184	adds	$lo0,$alo,$hi0
185	sub	$j,$j,#8		// j--
186	adc	$hi0,$ahi,xzr
187
188	adds	$lo1,$nlo,$hi1
189	ldr	$nj,[$np],#8
190	adc	$hi1,$nhi,xzr
191
192	mul	$alo,$aj,$m0		// ap[j]*bp[i]
193	adds	$lo0,$lo0,$tj
194	umulh	$ahi,$aj,$m0
195	adc	$hi0,$hi0,xzr
196
197	mul	$nlo,$nj,$m1		// np[j]*m1
198	adds	$lo1,$lo1,$lo0
199	umulh	$nhi,$nj,$m1
200	str	$lo1,[$tp,#-16]		// tp[j-1]
201	cbnz	$j,.Linner
202
203.Linner_skip:
204	ldr	$tj,[$tp],#8		// tp[j]
205	adc	$hi1,$hi1,xzr
206	adds	$lo0,$alo,$hi0
207	sub	$ap,$ap,$num		// rewind $ap
208	adc	$hi0,$ahi,xzr
209
210	adds	$lo1,$nlo,$hi1
211	sub	$np,$np,$num		// rewind $np
212	adcs	$hi1,$nhi,$ovf
213	adc	$ovf,xzr,xzr
214
215	adds	$lo0,$lo0,$tj
216	adc	$hi0,$hi0,xzr
217
218	adds	$lo1,$lo1,$lo0
219	adcs	$hi1,$hi1,$hi0
220	adc	$ovf,$ovf,xzr		// upmost overflow bit
221	stp	$lo1,$hi1,[$tp,#-16]
222
223	cbnz	$i,.Louter
224
225	// Final step. We see if result is larger than modulus, and
226	// if it is, subtract the modulus. But comparison implies
227	// subtraction. So we subtract modulus, see if it borrowed,
228	// and conditionally copy original value.
229	ldr	$tj,[sp]		// tp[0]
230	add	$tp,sp,#8
231	ldr	$nj,[$np],#8		// np[0]
232	subs	$j,$num,#8		// j=num-1 and clear borrow
233	mov	$ap,$rp
234.Lsub:
235	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
236	ldr	$tj,[$tp],#8
237	sub	$j,$j,#8		// j--
238	ldr	$nj,[$np],#8
239	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
240	cbnz	$j,.Lsub
241
242	sbcs	$aj,$tj,$nj
243	sbcs	$ovf,$ovf,xzr		// did it borrow?
244	str	$aj,[$ap],#8		// rp[num-1]
245
246	ldr	$tj,[sp]		// tp[0]
247	add	$tp,sp,#8
248	ldr	$aj,[$rp],#8		// rp[0]
249	sub	$num,$num,#8		// num--
250	nop
251.Lcond_copy:
252	sub	$num,$num,#8		// num--
253	csel	$nj,$tj,$aj,lo		// did it borrow?
254	ldr	$tj,[$tp],#8
255	ldr	$aj,[$rp],#8
256	str	xzr,[$tp,#-16]		// wipe tp
257	str	$nj,[$rp,#-16]
258	cbnz	$num,.Lcond_copy
259
260	csel	$nj,$tj,$aj,lo
261	str	xzr,[$tp,#-8]		// wipe tp
262	str	$nj,[$rp,#-8]
263
264	ldp	x19,x20,[x29,#16]
265	mov	sp,x29
266	ldp	x21,x22,[x29,#32]
267	mov	x0,#1
268	ldp	x23,x24,[x29,#48]
269	ldr	x29,[sp],#64
270	ret
271.size	bn_mul_mont,.-bn_mul_mont
272___
273{
274########################################################################
275# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
276
277my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
278my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
279my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
280my ($cnt,$carry,$topmost)=("x27","x28","x30");
281my ($tp,$ap_end,$na0)=($bp,$np,$carry);
282
283$code.=<<___;
284.type	__bn_sqr8x_mont,%function
285.align	5
286__bn_sqr8x_mont:
287	cmp	$ap,$bp
288	b.ne	__bn_mul4x_mont
289.Lsqr8x_mont:
290	.inst	0xd503233f		// paciasp
291	stp	x29,x30,[sp,#-128]!
292	add	x29,sp,#0
293	stp	x19,x20,[sp,#16]
294	stp	x21,x22,[sp,#32]
295	stp	x23,x24,[sp,#48]
296	stp	x25,x26,[sp,#64]
297	stp	x27,x28,[sp,#80]
298	stp	$rp,$np,[sp,#96]	// offload rp and np
299
300	ldp	$a0,$a1,[$ap,#8*0]
301	ldp	$a2,$a3,[$ap,#8*2]
302	ldp	$a4,$a5,[$ap,#8*4]
303	ldp	$a6,$a7,[$ap,#8*6]
304
305	sub	$tp,sp,$num,lsl#4
306	lsl	$num,$num,#3
307	ldr	$n0,[$n0]		// *n0
308	mov	sp,$tp			// alloca
309	sub	$cnt,$num,#8*8
310	b	.Lsqr8x_zero_start
311
312.Lsqr8x_zero:
313	sub	$cnt,$cnt,#8*8
314	stp	xzr,xzr,[$tp,#8*0]
315	stp	xzr,xzr,[$tp,#8*2]
316	stp	xzr,xzr,[$tp,#8*4]
317	stp	xzr,xzr,[$tp,#8*6]
318.Lsqr8x_zero_start:
319	stp	xzr,xzr,[$tp,#8*8]
320	stp	xzr,xzr,[$tp,#8*10]
321	stp	xzr,xzr,[$tp,#8*12]
322	stp	xzr,xzr,[$tp,#8*14]
323	add	$tp,$tp,#8*16
324	cbnz	$cnt,.Lsqr8x_zero
325
326	add	$ap_end,$ap,$num
327	add	$ap,$ap,#8*8
328	mov	$acc0,xzr
329	mov	$acc1,xzr
330	mov	$acc2,xzr
331	mov	$acc3,xzr
332	mov	$acc4,xzr
333	mov	$acc5,xzr
334	mov	$acc6,xzr
335	mov	$acc7,xzr
336	mov	$tp,sp
337	str	$n0,[x29,#112]		// offload n0
338
339	// Multiply everything but a[i]*a[i]
340.align	4
341.Lsqr8x_outer_loop:
342        //                                                 a[1]a[0]	(i)
343        //                                             a[2]a[0]
344        //                                         a[3]a[0]
345        //                                     a[4]a[0]
346        //                                 a[5]a[0]
347        //                             a[6]a[0]
348        //                         a[7]a[0]
349        //                                         a[2]a[1]		(ii)
350        //                                     a[3]a[1]
351        //                                 a[4]a[1]
352        //                             a[5]a[1]
353        //                         a[6]a[1]
354        //                     a[7]a[1]
355        //                                 a[3]a[2]			(iii)
356        //                             a[4]a[2]
357        //                         a[5]a[2]
358        //                     a[6]a[2]
359        //                 a[7]a[2]
360        //                         a[4]a[3]				(iv)
361        //                     a[5]a[3]
362        //                 a[6]a[3]
363        //             a[7]a[3]
364        //                 a[5]a[4]					(v)
365        //             a[6]a[4]
366        //         a[7]a[4]
367        //         a[6]a[5]						(vi)
368        //     a[7]a[5]
369        // a[7]a[6]							(vii)
370
371	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
372	mul	$t1,$a2,$a0
373	mul	$t2,$a3,$a0
374	mul	$t3,$a4,$a0
375	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
376	mul	$t0,$a5,$a0
377	adcs	$acc2,$acc2,$t1
378	mul	$t1,$a6,$a0
379	adcs	$acc3,$acc3,$t2
380	mul	$t2,$a7,$a0
381	adcs	$acc4,$acc4,$t3
382	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
383	adcs	$acc5,$acc5,$t0
384	umulh	$t0,$a2,$a0
385	adcs	$acc6,$acc6,$t1
386	umulh	$t1,$a3,$a0
387	adcs	$acc7,$acc7,$t2
388	umulh	$t2,$a4,$a0
389	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
390	adc	$acc0,xzr,xzr		// t[8]
391	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
392	umulh	$t3,$a5,$a0
393	adcs	$acc3,$acc3,$t0
394	umulh	$t0,$a6,$a0
395	adcs	$acc4,$acc4,$t1
396	umulh	$t1,$a7,$a0
397	adcs	$acc5,$acc5,$t2
398	 mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
399	adcs	$acc6,$acc6,$t3
400	 mul	$t3,$a3,$a1
401	adcs	$acc7,$acc7,$t0
402	 mul	$t0,$a4,$a1
403	adc	$acc0,$acc0,$t1
404
405	mul	$t1,$a5,$a1
406	adds	$acc3,$acc3,$t2
407	mul	$t2,$a6,$a1
408	adcs	$acc4,$acc4,$t3
409	mul	$t3,$a7,$a1
410	adcs	$acc5,$acc5,$t0
411	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
412	adcs	$acc6,$acc6,$t1
413	umulh	$t1,$a3,$a1
414	adcs	$acc7,$acc7,$t2
415	umulh	$t2,$a4,$a1
416	adcs	$acc0,$acc0,$t3
417	umulh	$t3,$a5,$a1
418	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
419	adc	$acc1,xzr,xzr		// t[9]
420	adds	$acc4,$acc4,$t0
421	umulh	$t0,$a6,$a1
422	adcs	$acc5,$acc5,$t1
423	umulh	$t1,$a7,$a1
424	adcs	$acc6,$acc6,$t2
425	 mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
426	adcs	$acc7,$acc7,$t3
427	 mul	$t3,$a4,$a2
428	adcs	$acc0,$acc0,$t0
429	 mul	$t0,$a5,$a2
430	adc	$acc1,$acc1,$t1
431
432	mul	$t1,$a6,$a2
433	adds	$acc5,$acc5,$t2
434	mul	$t2,$a7,$a2
435	adcs	$acc6,$acc6,$t3
436	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
437	adcs	$acc7,$acc7,$t0
438	umulh	$t0,$a4,$a2
439	adcs	$acc0,$acc0,$t1
440	umulh	$t1,$a5,$a2
441	adcs	$acc1,$acc1,$t2
442	umulh	$t2,$a6,$a2
443	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
444	adc	$acc2,xzr,xzr		// t[10]
445	adds	$acc6,$acc6,$t3
446	umulh	$t3,$a7,$a2
447	adcs	$acc7,$acc7,$t0
448	 mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
449	adcs	$acc0,$acc0,$t1
450	 mul	$t1,$a5,$a3
451	adcs	$acc1,$acc1,$t2
452	 mul	$t2,$a6,$a3
453	adc	$acc2,$acc2,$t3
454
455	mul	$t3,$a7,$a3
456	adds	$acc7,$acc7,$t0
457	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
458	adcs	$acc0,$acc0,$t1
459	umulh	$t1,$a5,$a3
460	adcs	$acc1,$acc1,$t2
461	umulh	$t2,$a6,$a3
462	adcs	$acc2,$acc2,$t3
463	umulh	$t3,$a7,$a3
464	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
465	adc	$acc3,xzr,xzr		// t[11]
466	adds	$acc0,$acc0,$t0
467	 mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
468	adcs	$acc1,$acc1,$t1
469	 mul	$t1,$a6,$a4
470	adcs	$acc2,$acc2,$t2
471	 mul	$t2,$a7,$a4
472	adc	$acc3,$acc3,$t3
473
474	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
475	adds	$acc1,$acc1,$t0
476	umulh	$t0,$a6,$a4
477	adcs	$acc2,$acc2,$t1
478	umulh	$t1,$a7,$a4
479	adcs	$acc3,$acc3,$t2
480	 mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
481	adc	$acc4,xzr,xzr		// t[12]
482	adds	$acc2,$acc2,$t3
483	 mul	$t3,$a7,$a5
484	adcs	$acc3,$acc3,$t0
485	 umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
486	adc	$acc4,$acc4,$t1
487
488	umulh	$t1,$a7,$a5
489	adds	$acc3,$acc3,$t2
490	 mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
491	adcs	$acc4,$acc4,$t3
492	 umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
493	adc	$acc5,xzr,xzr		// t[13]
494	adds	$acc4,$acc4,$t0
495	sub	$cnt,$ap_end,$ap	// done yet?
496	adc	$acc5,$acc5,$t1
497
498	adds	$acc5,$acc5,$t2
499	sub	$t0,$ap_end,$num	// rewinded ap
500	adc	$acc6,xzr,xzr		// t[14]
501	add	$acc6,$acc6,$t3
502
503	cbz	$cnt,.Lsqr8x_outer_break
504
505	mov	$n0,$a0
506	ldp	$a0,$a1,[$tp,#8*0]
507	ldp	$a2,$a3,[$tp,#8*2]
508	ldp	$a4,$a5,[$tp,#8*4]
509	ldp	$a6,$a7,[$tp,#8*6]
510	adds	$acc0,$acc0,$a0
511	adcs	$acc1,$acc1,$a1
512	ldp	$a0,$a1,[$ap,#8*0]
513	adcs	$acc2,$acc2,$a2
514	adcs	$acc3,$acc3,$a3
515	ldp	$a2,$a3,[$ap,#8*2]
516	adcs	$acc4,$acc4,$a4
517	adcs	$acc5,$acc5,$a5
518	ldp	$a4,$a5,[$ap,#8*4]
519	adcs	$acc6,$acc6,$a6
520	mov	$rp,$ap
521	adcs	$acc7,xzr,$a7
522	ldp	$a6,$a7,[$ap,#8*6]
523	add	$ap,$ap,#8*8
524	//adc	$carry,xzr,xzr		// moved below
525	mov	$cnt,#-8*8
526
527	//                                                         a[8]a[0]
528	//                                                     a[9]a[0]
529	//                                                 a[a]a[0]
530	//                                             a[b]a[0]
531	//                                         a[c]a[0]
532	//                                     a[d]a[0]
533	//                                 a[e]a[0]
534	//                             a[f]a[0]
535	//                                                     a[8]a[1]
536	//                         a[f]a[1]........................
537	//                                                 a[8]a[2]
538	//                     a[f]a[2]........................
539	//                                             a[8]a[3]
540	//                 a[f]a[3]........................
541	//                                         a[8]a[4]
542	//             a[f]a[4]........................
543	//                                     a[8]a[5]
544	//         a[f]a[5]........................
545	//                                 a[8]a[6]
546	//     a[f]a[6]........................
547	//                             a[8]a[7]
548	// a[f]a[7]........................
549.Lsqr8x_mul:
550	mul	$t0,$a0,$n0
551	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
552	mul	$t1,$a1,$n0
553	add	$cnt,$cnt,#8
554	mul	$t2,$a2,$n0
555	mul	$t3,$a3,$n0
556	adds	$acc0,$acc0,$t0
557	mul	$t0,$a4,$n0
558	adcs	$acc1,$acc1,$t1
559	mul	$t1,$a5,$n0
560	adcs	$acc2,$acc2,$t2
561	mul	$t2,$a6,$n0
562	adcs	$acc3,$acc3,$t3
563	mul	$t3,$a7,$n0
564	adcs	$acc4,$acc4,$t0
565	umulh	$t0,$a0,$n0
566	adcs	$acc5,$acc5,$t1
567	umulh	$t1,$a1,$n0
568	adcs	$acc6,$acc6,$t2
569	umulh	$t2,$a2,$n0
570	adcs	$acc7,$acc7,$t3
571	umulh	$t3,$a3,$n0
572	adc	$carry,$carry,xzr
573	str	$acc0,[$tp],#8
574	adds	$acc0,$acc1,$t0
575	umulh	$t0,$a4,$n0
576	adcs	$acc1,$acc2,$t1
577	umulh	$t1,$a5,$n0
578	adcs	$acc2,$acc3,$t2
579	umulh	$t2,$a6,$n0
580	adcs	$acc3,$acc4,$t3
581	umulh	$t3,$a7,$n0
582	ldr	$n0,[$rp,$cnt]
583	adcs	$acc4,$acc5,$t0
584	adcs	$acc5,$acc6,$t1
585	adcs	$acc6,$acc7,$t2
586	adcs	$acc7,$carry,$t3
587	//adc	$carry,xzr,xzr		// moved above
588	cbnz	$cnt,.Lsqr8x_mul
589					// note that carry flag is guaranteed
590					// to be zero at this point
591	cmp	$ap,$ap_end		// done yet?
592	b.eq	.Lsqr8x_break
593
594	ldp	$a0,$a1,[$tp,#8*0]
595	ldp	$a2,$a3,[$tp,#8*2]
596	ldp	$a4,$a5,[$tp,#8*4]
597	ldp	$a6,$a7,[$tp,#8*6]
598	adds	$acc0,$acc0,$a0
599	ldr	$n0,[$rp,#-8*8]
600	adcs	$acc1,$acc1,$a1
601	ldp	$a0,$a1,[$ap,#8*0]
602	adcs	$acc2,$acc2,$a2
603	adcs	$acc3,$acc3,$a3
604	ldp	$a2,$a3,[$ap,#8*2]
605	adcs	$acc4,$acc4,$a4
606	adcs	$acc5,$acc5,$a5
607	ldp	$a4,$a5,[$ap,#8*4]
608	adcs	$acc6,$acc6,$a6
609	mov	$cnt,#-8*8
610	adcs	$acc7,$acc7,$a7
611	ldp	$a6,$a7,[$ap,#8*6]
612	add	$ap,$ap,#8*8
613	//adc	$carry,xzr,xzr		// moved above
614	b	.Lsqr8x_mul
615
616.align	4
617.Lsqr8x_break:
618	ldp	$a0,$a1,[$rp,#8*0]
619	add	$ap,$rp,#8*8
620	ldp	$a2,$a3,[$rp,#8*2]
621	sub	$t0,$ap_end,$ap		// is it last iteration?
622	ldp	$a4,$a5,[$rp,#8*4]
623	sub	$t1,$tp,$t0
624	ldp	$a6,$a7,[$rp,#8*6]
625	cbz	$t0,.Lsqr8x_outer_loop
626
627	stp	$acc0,$acc1,[$tp,#8*0]
628	ldp	$acc0,$acc1,[$t1,#8*0]
629	stp	$acc2,$acc3,[$tp,#8*2]
630	ldp	$acc2,$acc3,[$t1,#8*2]
631	stp	$acc4,$acc5,[$tp,#8*4]
632	ldp	$acc4,$acc5,[$t1,#8*4]
633	stp	$acc6,$acc7,[$tp,#8*6]
634	mov	$tp,$t1
635	ldp	$acc6,$acc7,[$t1,#8*6]
636	b	.Lsqr8x_outer_loop
637
638.align	4
639.Lsqr8x_outer_break:
640	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
641	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
642	ldp	$t1,$t2,[sp,#8*1]
643	ldp	$a5,$a7,[$t0,#8*2]
644	add	$ap,$t0,#8*4
645	ldp	$t3,$t0,[sp,#8*3]
646
647	stp	$acc0,$acc1,[$tp,#8*0]
648	mul	$acc0,$a1,$a1
649	stp	$acc2,$acc3,[$tp,#8*2]
650	umulh	$a1,$a1,$a1
651	stp	$acc4,$acc5,[$tp,#8*4]
652	mul	$a2,$a3,$a3
653	stp	$acc6,$acc7,[$tp,#8*6]
654	mov	$tp,sp
655	umulh	$a3,$a3,$a3
656	adds	$acc1,$a1,$t1,lsl#1
657	extr	$t1,$t2,$t1,#63
658	sub	$cnt,$num,#8*4
659
660.Lsqr4x_shift_n_add:
661	adcs	$acc2,$a2,$t1
662	extr	$t2,$t3,$t2,#63
663	sub	$cnt,$cnt,#8*4
664	adcs	$acc3,$a3,$t2
665	ldp	$t1,$t2,[$tp,#8*5]
666	mul	$a4,$a5,$a5
667	ldp	$a1,$a3,[$ap],#8*2
668	umulh	$a5,$a5,$a5
669	mul	$a6,$a7,$a7
670	umulh	$a7,$a7,$a7
671	extr	$t3,$t0,$t3,#63
672	stp	$acc0,$acc1,[$tp,#8*0]
673	adcs	$acc4,$a4,$t3
674	extr	$t0,$t1,$t0,#63
675	stp	$acc2,$acc3,[$tp,#8*2]
676	adcs	$acc5,$a5,$t0
677	ldp	$t3,$t0,[$tp,#8*7]
678	extr	$t1,$t2,$t1,#63
679	adcs	$acc6,$a6,$t1
680	extr	$t2,$t3,$t2,#63
681	adcs	$acc7,$a7,$t2
682	ldp	$t1,$t2,[$tp,#8*9]
683	mul	$a0,$a1,$a1
684	ldp	$a5,$a7,[$ap],#8*2
685	umulh	$a1,$a1,$a1
686	mul	$a2,$a3,$a3
687	umulh	$a3,$a3,$a3
688	stp	$acc4,$acc5,[$tp,#8*4]
689	extr	$t3,$t0,$t3,#63
690	stp	$acc6,$acc7,[$tp,#8*6]
691	add	$tp,$tp,#8*8
692	adcs	$acc0,$a0,$t3
693	extr	$t0,$t1,$t0,#63
694	adcs	$acc1,$a1,$t0
695	ldp	$t3,$t0,[$tp,#8*3]
696	extr	$t1,$t2,$t1,#63
697	cbnz	$cnt,.Lsqr4x_shift_n_add
698___
699my ($np,$np_end)=($ap,$ap_end);
700$code.=<<___;
701	 ldp	$np,$n0,[x29,#104]	// pull np and n0
702
703	adcs	$acc2,$a2,$t1
704	extr	$t2,$t3,$t2,#63
705	adcs	$acc3,$a3,$t2
706	ldp	$t1,$t2,[$tp,#8*5]
707	mul	$a4,$a5,$a5
708	umulh	$a5,$a5,$a5
709	stp	$acc0,$acc1,[$tp,#8*0]
710	mul	$a6,$a7,$a7
711	umulh	$a7,$a7,$a7
712	stp	$acc2,$acc3,[$tp,#8*2]
713	extr	$t3,$t0,$t3,#63
714	adcs	$acc4,$a4,$t3
715	extr	$t0,$t1,$t0,#63
716	 ldp	$acc0,$acc1,[sp,#8*0]
717	adcs	$acc5,$a5,$t0
718	extr	$t1,$t2,$t1,#63
719	 ldp	$a0,$a1,[$np,#8*0]
720	adcs	$acc6,$a6,$t1
721	extr	$t2,xzr,$t2,#63
722	 ldp	$a2,$a3,[$np,#8*2]
723	adc	$acc7,$a7,$t2
724	 ldp	$a4,$a5,[$np,#8*4]
725
726	// Reduce by 512 bits per iteration
727	mul	$na0,$n0,$acc0		// t[0]*n0
728	ldp	$a6,$a7,[$np,#8*6]
729	add	$np_end,$np,$num
730	ldp	$acc2,$acc3,[sp,#8*2]
731	stp	$acc4,$acc5,[$tp,#8*4]
732	ldp	$acc4,$acc5,[sp,#8*4]
733	stp	$acc6,$acc7,[$tp,#8*6]
734	ldp	$acc6,$acc7,[sp,#8*6]
735	add	$np,$np,#8*8
736	mov	$topmost,xzr		// initial top-most carry
737	mov	$tp,sp
738	mov	$cnt,#8
739
740.Lsqr8x_reduction:
741	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
742	mul	$t1,$a1,$na0
743	sub	$cnt,$cnt,#1
744	mul	$t2,$a2,$na0
745	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
746	mul	$t3,$a3,$na0
747	// (*)	adds	xzr,$acc0,$t0
748	subs	xzr,$acc0,#1		// (*)
749	mul	$t0,$a4,$na0
750	adcs	$acc0,$acc1,$t1
751	mul	$t1,$a5,$na0
752	adcs	$acc1,$acc2,$t2
753	mul	$t2,$a6,$na0
754	adcs	$acc2,$acc3,$t3
755	mul	$t3,$a7,$na0
756	adcs	$acc3,$acc4,$t0
757	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
758	adcs	$acc4,$acc5,$t1
759	umulh	$t1,$a1,$na0
760	adcs	$acc5,$acc6,$t2
761	umulh	$t2,$a2,$na0
762	adcs	$acc6,$acc7,$t3
763	umulh	$t3,$a3,$na0
764	adc	$acc7,xzr,xzr
765	adds	$acc0,$acc0,$t0
766	umulh	$t0,$a4,$na0
767	adcs	$acc1,$acc1,$t1
768	umulh	$t1,$a5,$na0
769	adcs	$acc2,$acc2,$t2
770	umulh	$t2,$a6,$na0
771	adcs	$acc3,$acc3,$t3
772	umulh	$t3,$a7,$na0
773	mul	$na0,$n0,$acc0		// next t[0]*n0
774	adcs	$acc4,$acc4,$t0
775	adcs	$acc5,$acc5,$t1
776	adcs	$acc6,$acc6,$t2
777	adc	$acc7,$acc7,$t3
778	cbnz	$cnt,.Lsqr8x_reduction
779
780	ldp	$t0,$t1,[$tp,#8*0]
781	ldp	$t2,$t3,[$tp,#8*2]
782	mov	$rp,$tp
783	sub	$cnt,$np_end,$np	// done yet?
784	adds	$acc0,$acc0,$t0
785	adcs	$acc1,$acc1,$t1
786	ldp	$t0,$t1,[$tp,#8*4]
787	adcs	$acc2,$acc2,$t2
788	adcs	$acc3,$acc3,$t3
789	ldp	$t2,$t3,[$tp,#8*6]
790	adcs	$acc4,$acc4,$t0
791	adcs	$acc5,$acc5,$t1
792	adcs	$acc6,$acc6,$t2
793	adcs	$acc7,$acc7,$t3
794	//adc	$carry,xzr,xzr		// moved below
795	cbz	$cnt,.Lsqr8x8_post_condition
796
797	ldr	$n0,[$tp,#-8*8]
798	ldp	$a0,$a1,[$np,#8*0]
799	ldp	$a2,$a3,[$np,#8*2]
800	ldp	$a4,$a5,[$np,#8*4]
801	mov	$cnt,#-8*8
802	ldp	$a6,$a7,[$np,#8*6]
803	add	$np,$np,#8*8
804
805.Lsqr8x_tail:
806	mul	$t0,$a0,$n0
807	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
808	mul	$t1,$a1,$n0
809	add	$cnt,$cnt,#8
810	mul	$t2,$a2,$n0
811	mul	$t3,$a3,$n0
812	adds	$acc0,$acc0,$t0
813	mul	$t0,$a4,$n0
814	adcs	$acc1,$acc1,$t1
815	mul	$t1,$a5,$n0
816	adcs	$acc2,$acc2,$t2
817	mul	$t2,$a6,$n0
818	adcs	$acc3,$acc3,$t3
819	mul	$t3,$a7,$n0
820	adcs	$acc4,$acc4,$t0
821	umulh	$t0,$a0,$n0
822	adcs	$acc5,$acc5,$t1
823	umulh	$t1,$a1,$n0
824	adcs	$acc6,$acc6,$t2
825	umulh	$t2,$a2,$n0
826	adcs	$acc7,$acc7,$t3
827	umulh	$t3,$a3,$n0
828	adc	$carry,$carry,xzr
829	str	$acc0,[$tp],#8
830	adds	$acc0,$acc1,$t0
831	umulh	$t0,$a4,$n0
832	adcs	$acc1,$acc2,$t1
833	umulh	$t1,$a5,$n0
834	adcs	$acc2,$acc3,$t2
835	umulh	$t2,$a6,$n0
836	adcs	$acc3,$acc4,$t3
837	umulh	$t3,$a7,$n0
838	ldr	$n0,[$rp,$cnt]
839	adcs	$acc4,$acc5,$t0
840	adcs	$acc5,$acc6,$t1
841	adcs	$acc6,$acc7,$t2
842	adcs	$acc7,$carry,$t3
843	//adc	$carry,xzr,xzr		// moved above
844	cbnz	$cnt,.Lsqr8x_tail
845					// note that carry flag is guaranteed
846					// to be zero at this point
847	ldp	$a0,$a1,[$tp,#8*0]
848	sub	$cnt,$np_end,$np	// done yet?
849	sub	$t2,$np_end,$num	// rewinded np
850	ldp	$a2,$a3,[$tp,#8*2]
851	ldp	$a4,$a5,[$tp,#8*4]
852	ldp	$a6,$a7,[$tp,#8*6]
853	cbz	$cnt,.Lsqr8x_tail_break
854
855	ldr	$n0,[$rp,#-8*8]
856	adds	$acc0,$acc0,$a0
857	adcs	$acc1,$acc1,$a1
858	ldp	$a0,$a1,[$np,#8*0]
859	adcs	$acc2,$acc2,$a2
860	adcs	$acc3,$acc3,$a3
861	ldp	$a2,$a3,[$np,#8*2]
862	adcs	$acc4,$acc4,$a4
863	adcs	$acc5,$acc5,$a5
864	ldp	$a4,$a5,[$np,#8*4]
865	adcs	$acc6,$acc6,$a6
866	mov	$cnt,#-8*8
867	adcs	$acc7,$acc7,$a7
868	ldp	$a6,$a7,[$np,#8*6]
869	add	$np,$np,#8*8
870	//adc	$carry,xzr,xzr		// moved above
871	b	.Lsqr8x_tail
872
873.align	4
874.Lsqr8x_tail_break:
875	ldr	$n0,[x29,#112]		// pull n0
876	add	$cnt,$tp,#8*8		// end of current t[num] window
877
878	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
879	adcs	$t0,$acc0,$a0
880	adcs	$t1,$acc1,$a1
881	ldp	$acc0,$acc1,[$rp,#8*0]
882	adcs	$acc2,$acc2,$a2
883	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
884	adcs	$acc3,$acc3,$a3
885	ldp	$a2,$a3,[$t2,#8*2]
886	adcs	$acc4,$acc4,$a4
887	adcs	$acc5,$acc5,$a5
888	ldp	$a4,$a5,[$t2,#8*4]
889	adcs	$acc6,$acc6,$a6
890	adcs	$acc7,$acc7,$a7
891	ldp	$a6,$a7,[$t2,#8*6]
892	add	$np,$t2,#8*8
893	adc	$topmost,xzr,xzr	// top-most carry
894	mul	$na0,$n0,$acc0
895	stp	$t0,$t1,[$tp,#8*0]
896	stp	$acc2,$acc3,[$tp,#8*2]
897	ldp	$acc2,$acc3,[$rp,#8*2]
898	stp	$acc4,$acc5,[$tp,#8*4]
899	ldp	$acc4,$acc5,[$rp,#8*4]
900	cmp	$cnt,x29		// did we hit the bottom?
901	stp	$acc6,$acc7,[$tp,#8*6]
902	mov	$tp,$rp			// slide the window
903	ldp	$acc6,$acc7,[$rp,#8*6]
904	mov	$cnt,#8
905	b.ne	.Lsqr8x_reduction
906
907	// Final step. We see if result is larger than modulus, and
908	// if it is, subtract the modulus. But comparison implies
909	// subtraction. So we subtract modulus, see if it borrowed,
910	// and conditionally copy original value.
911	ldr	$rp,[x29,#96]		// pull rp
912	add	$tp,$tp,#8*8
913	subs	$t0,$acc0,$a0
914	sbcs	$t1,$acc1,$a1
915	sub	$cnt,$num,#8*8
916	mov	$ap_end,$rp		// $rp copy
917
918.Lsqr8x_sub:
919	sbcs	$t2,$acc2,$a2
920	ldp	$a0,$a1,[$np,#8*0]
921	sbcs	$t3,$acc3,$a3
922	stp	$t0,$t1,[$rp,#8*0]
923	sbcs	$t0,$acc4,$a4
924	ldp	$a2,$a3,[$np,#8*2]
925	sbcs	$t1,$acc5,$a5
926	stp	$t2,$t3,[$rp,#8*2]
927	sbcs	$t2,$acc6,$a6
928	ldp	$a4,$a5,[$np,#8*4]
929	sbcs	$t3,$acc7,$a7
930	ldp	$a6,$a7,[$np,#8*6]
931	add	$np,$np,#8*8
932	ldp	$acc0,$acc1,[$tp,#8*0]
933	sub	$cnt,$cnt,#8*8
934	ldp	$acc2,$acc3,[$tp,#8*2]
935	ldp	$acc4,$acc5,[$tp,#8*4]
936	ldp	$acc6,$acc7,[$tp,#8*6]
937	add	$tp,$tp,#8*8
938	stp	$t0,$t1,[$rp,#8*4]
939	sbcs	$t0,$acc0,$a0
940	stp	$t2,$t3,[$rp,#8*6]
941	add	$rp,$rp,#8*8
942	sbcs	$t1,$acc1,$a1
943	cbnz	$cnt,.Lsqr8x_sub
944
945	sbcs	$t2,$acc2,$a2
946	 mov	$tp,sp
947	 add	$ap,sp,$num
948	 ldp	$a0,$a1,[$ap_end,#8*0]
949	sbcs	$t3,$acc3,$a3
950	stp	$t0,$t1,[$rp,#8*0]
951	sbcs	$t0,$acc4,$a4
952	 ldp	$a2,$a3,[$ap_end,#8*2]
953	sbcs	$t1,$acc5,$a5
954	stp	$t2,$t3,[$rp,#8*2]
955	sbcs	$t2,$acc6,$a6
956	 ldp	$acc0,$acc1,[$ap,#8*0]
957	sbcs	$t3,$acc7,$a7
958	 ldp	$acc2,$acc3,[$ap,#8*2]
959	sbcs	xzr,$topmost,xzr	// did it borrow?
960	ldr	x30,[x29,#8]		// pull return address
961	stp	$t0,$t1,[$rp,#8*4]
962	stp	$t2,$t3,[$rp,#8*6]
963
964	sub	$cnt,$num,#8*4
965.Lsqr4x_cond_copy:
966	sub	$cnt,$cnt,#8*4
967	csel	$t0,$acc0,$a0,lo
968	 stp	xzr,xzr,[$tp,#8*0]
969	csel	$t1,$acc1,$a1,lo
970	ldp	$a0,$a1,[$ap_end,#8*4]
971	ldp	$acc0,$acc1,[$ap,#8*4]
972	csel	$t2,$acc2,$a2,lo
973	 stp	xzr,xzr,[$tp,#8*2]
974	 add	$tp,$tp,#8*4
975	csel	$t3,$acc3,$a3,lo
976	ldp	$a2,$a3,[$ap_end,#8*6]
977	ldp	$acc2,$acc3,[$ap,#8*6]
978	add	$ap,$ap,#8*4
979	stp	$t0,$t1,[$ap_end,#8*0]
980	stp	$t2,$t3,[$ap_end,#8*2]
981	add	$ap_end,$ap_end,#8*4
982	 stp	xzr,xzr,[$ap,#8*0]
983	 stp	xzr,xzr,[$ap,#8*2]
984	cbnz	$cnt,.Lsqr4x_cond_copy
985
986	csel	$t0,$acc0,$a0,lo
987	 stp	xzr,xzr,[$tp,#8*0]
988	csel	$t1,$acc1,$a1,lo
989	 stp	xzr,xzr,[$tp,#8*2]
990	csel	$t2,$acc2,$a2,lo
991	csel	$t3,$acc3,$a3,lo
992	stp	$t0,$t1,[$ap_end,#8*0]
993	stp	$t2,$t3,[$ap_end,#8*2]
994
995	b	.Lsqr8x_done
996
997.align	4
998.Lsqr8x8_post_condition:
999	adc	$carry,xzr,xzr
1000	ldr	x30,[x29,#8]		// pull return address
1001	// $acc0-7,$carry hold result, $a0-7 hold modulus
1002	subs	$a0,$acc0,$a0
1003	ldr	$ap,[x29,#96]		// pull rp
1004	sbcs	$a1,$acc1,$a1
1005	 stp	xzr,xzr,[sp,#8*0]
1006	sbcs	$a2,$acc2,$a2
1007	 stp	xzr,xzr,[sp,#8*2]
1008	sbcs	$a3,$acc3,$a3
1009	 stp	xzr,xzr,[sp,#8*4]
1010	sbcs	$a4,$acc4,$a4
1011	 stp	xzr,xzr,[sp,#8*6]
1012	sbcs	$a5,$acc5,$a5
1013	 stp	xzr,xzr,[sp,#8*8]
1014	sbcs	$a6,$acc6,$a6
1015	 stp	xzr,xzr,[sp,#8*10]
1016	sbcs	$a7,$acc7,$a7
1017	 stp	xzr,xzr,[sp,#8*12]
1018	sbcs	$carry,$carry,xzr	// did it borrow?
1019	 stp	xzr,xzr,[sp,#8*14]
1020
1021	// $a0-7 hold result-modulus
1022	csel	$a0,$acc0,$a0,lo
1023	csel	$a1,$acc1,$a1,lo
1024	csel	$a2,$acc2,$a2,lo
1025	csel	$a3,$acc3,$a3,lo
1026	stp	$a0,$a1,[$ap,#8*0]
1027	csel	$a4,$acc4,$a4,lo
1028	csel	$a5,$acc5,$a5,lo
1029	stp	$a2,$a3,[$ap,#8*2]
1030	csel	$a6,$acc6,$a6,lo
1031	csel	$a7,$acc7,$a7,lo
1032	stp	$a4,$a5,[$ap,#8*4]
1033	stp	$a6,$a7,[$ap,#8*6]
1034
1035.Lsqr8x_done:
1036	ldp	x19,x20,[x29,#16]
1037	mov	sp,x29
1038	ldp	x21,x22,[x29,#32]
1039	mov	x0,#1
1040	ldp	x23,x24,[x29,#48]
1041	ldp	x25,x26,[x29,#64]
1042	ldp	x27,x28,[x29,#80]
1043	ldr	x29,[sp],#128
1044	.inst	0xd50323bf		// autiasp
1045	ret
1046.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
1047___
1048}
1049
1050{
1051########################################################################
1052# Even though this might look as ARMv8 adaptation of mulx4x_mont from
1053# x86_64-mont5 module, it's different in sense that it performs
1054# reduction 256 bits at a time.
1055
1056my ($a0,$a1,$a2,$a3,
1057    $t0,$t1,$t2,$t3,
1058    $m0,$m1,$m2,$m3,
1059    $acc0,$acc1,$acc2,$acc3,$acc4,
1060    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1061my  $bp_end=$rp;
1062my  ($carry,$topmost) = ($rp,"x30");
1063
1064$code.=<<___;
1065.type	__bn_mul4x_mont,%function
1066.align	5
1067__bn_mul4x_mont:
1068	.inst	0xd503233f		// paciasp
1069	stp	x29,x30,[sp,#-128]!
1070	add	x29,sp,#0
1071	stp	x19,x20,[sp,#16]
1072	stp	x21,x22,[sp,#32]
1073	stp	x23,x24,[sp,#48]
1074	stp	x25,x26,[sp,#64]
1075	stp	x27,x28,[sp,#80]
1076
1077	sub	$tp,sp,$num,lsl#3
1078	lsl	$num,$num,#3
1079	ldr	$n0,[$n0]		// *n0
1080	sub	sp,$tp,#8*4		// alloca
1081
1082	add	$t0,$bp,$num
1083	add	$ap_end,$ap,$num
1084	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]
1085
1086	ldr	$bi,[$bp,#8*0]		// b[0]
1087	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
1088	ldp	$a2,$a3,[$ap,#8*2]
1089	add	$ap,$ap,#8*4
1090	mov	$acc0,xzr
1091	mov	$acc1,xzr
1092	mov	$acc2,xzr
1093	mov	$acc3,xzr
1094	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
1095	ldp	$m2,$m3,[$np,#8*2]
1096	adds	$np,$np,#8*4		// clear carry bit
1097	mov	$carry,xzr
1098	mov	$cnt,#0
1099	mov	$tp,sp
1100
1101.Loop_mul4x_1st_reduction:
1102	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
1103	adc	$carry,$carry,xzr	// modulo-scheduled
1104	mul	$t1,$a1,$bi
1105	add	$cnt,$cnt,#8
1106	mul	$t2,$a2,$bi
1107	and	$cnt,$cnt,#31
1108	mul	$t3,$a3,$bi
1109	adds	$acc0,$acc0,$t0
1110	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
1111	adcs	$acc1,$acc1,$t1
1112	mul	$mi,$acc0,$n0		// t[0]*n0
1113	adcs	$acc2,$acc2,$t2
1114	umulh	$t1,$a1,$bi
1115	adcs	$acc3,$acc3,$t3
1116	umulh	$t2,$a2,$bi
1117	adc	$acc4,xzr,xzr
1118	umulh	$t3,$a3,$bi
1119	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
1120	adds	$acc1,$acc1,$t0
1121	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
1122	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
1123	adcs	$acc2,$acc2,$t1
1124	mul	$t1,$m1,$mi
1125	adcs	$acc3,$acc3,$t2
1126	mul	$t2,$m2,$mi
1127	adc	$acc4,$acc4,$t3		// can't overflow
1128	mul	$t3,$m3,$mi
1129	// (*)	adds	xzr,$acc0,$t0
1130	subs	xzr,$acc0,#1		// (*)
1131	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
1132	adcs	$acc0,$acc1,$t1
1133	umulh	$t1,$m1,$mi
1134	adcs	$acc1,$acc2,$t2
1135	umulh	$t2,$m2,$mi
1136	adcs	$acc2,$acc3,$t3
1137	umulh	$t3,$m3,$mi
1138	adcs	$acc3,$acc4,$carry
1139	adc	$carry,xzr,xzr
1140	adds	$acc0,$acc0,$t0
1141	sub	$t0,$ap_end,$ap
1142	adcs	$acc1,$acc1,$t1
1143	adcs	$acc2,$acc2,$t2
1144	adcs	$acc3,$acc3,$t3
1145	//adc	$carry,$carry,xzr
1146	cbnz	$cnt,.Loop_mul4x_1st_reduction
1147
1148	cbz	$t0,.Lmul4x4_post_condition
1149
1150	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
1151	ldp	$a2,$a3,[$ap,#8*2]
1152	add	$ap,$ap,#8*4
1153	ldr	$mi,[sp]		// a[0]*n0
1154	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
1155	ldp	$m2,$m3,[$np,#8*2]
1156	add	$np,$np,#8*4
1157
1158.Loop_mul4x_1st_tail:
1159	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
1160	adc	$carry,$carry,xzr	// modulo-scheduled
1161	mul	$t1,$a1,$bi
1162	add	$cnt,$cnt,#8
1163	mul	$t2,$a2,$bi
1164	and	$cnt,$cnt,#31
1165	mul	$t3,$a3,$bi
1166	adds	$acc0,$acc0,$t0
1167	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
1168	adcs	$acc1,$acc1,$t1
1169	umulh	$t1,$a1,$bi
1170	adcs	$acc2,$acc2,$t2
1171	umulh	$t2,$a2,$bi
1172	adcs	$acc3,$acc3,$t3
1173	umulh	$t3,$a3,$bi
1174	adc	$acc4,xzr,xzr
1175	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
1176	adds	$acc1,$acc1,$t0
1177	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
1178	adcs	$acc2,$acc2,$t1
1179	mul	$t1,$m1,$mi
1180	adcs	$acc3,$acc3,$t2
1181	mul	$t2,$m2,$mi
1182	adc	$acc4,$acc4,$t3		// can't overflow
1183	mul	$t3,$m3,$mi
1184	adds	$acc0,$acc0,$t0
1185	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
1186	adcs	$acc1,$acc1,$t1
1187	umulh	$t1,$m1,$mi
1188	adcs	$acc2,$acc2,$t2
1189	umulh	$t2,$m2,$mi
1190	adcs	$acc3,$acc3,$t3
1191	adcs	$acc4,$acc4,$carry
1192	umulh	$t3,$m3,$mi
1193	adc	$carry,xzr,xzr
1194	ldr	$mi,[sp,$cnt]		// next t[0]*n0
1195	str	$acc0,[$tp],#8		// result!!!
1196	adds	$acc0,$acc1,$t0
1197	sub	$t0,$ap_end,$ap		// done yet?
1198	adcs	$acc1,$acc2,$t1
1199	adcs	$acc2,$acc3,$t2
1200	adcs	$acc3,$acc4,$t3
1201	//adc	$carry,$carry,xzr
1202	cbnz	$cnt,.Loop_mul4x_1st_tail
1203
1204	sub	$t1,$ap_end,$num	// rewinded $ap
1205	cbz	$t0,.Lmul4x_proceed
1206
1207	ldp	$a0,$a1,[$ap,#8*0]
1208	ldp	$a2,$a3,[$ap,#8*2]
1209	add	$ap,$ap,#8*4
1210	ldp	$m0,$m1,[$np,#8*0]
1211	ldp	$m2,$m3,[$np,#8*2]
1212	add	$np,$np,#8*4
1213	b	.Loop_mul4x_1st_tail
1214
1215.align	5
1216.Lmul4x_proceed:
1217	ldr	$bi,[$bp,#8*4]!		// *++b
1218	adc	$topmost,$carry,xzr
1219	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
1220	sub	$np,$np,$num		// rewind np
1221	ldp	$a2,$a3,[$t1,#8*2]
1222	add	$ap,$t1,#8*4
1223
1224	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
1225	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
1226	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
1227	ldp	$acc2,$acc3,[sp,#8*6]
1228
1229	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
1230	mov	$tp,sp
1231	ldp	$m2,$m3,[$np,#8*2]
1232	adds	$np,$np,#8*4		// clear carry bit
1233	mov	$carry,xzr
1234
1235.align	4
1236.Loop_mul4x_reduction:
1237	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
1238	adc	$carry,$carry,xzr	// modulo-scheduled
1239	mul	$t1,$a1,$bi
1240	add	$cnt,$cnt,#8
1241	mul	$t2,$a2,$bi
1242	and	$cnt,$cnt,#31
1243	mul	$t3,$a3,$bi
1244	adds	$acc0,$acc0,$t0
1245	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
1246	adcs	$acc1,$acc1,$t1
1247	mul	$mi,$acc0,$n0		// t[0]*n0
1248	adcs	$acc2,$acc2,$t2
1249	umulh	$t1,$a1,$bi
1250	adcs	$acc3,$acc3,$t3
1251	umulh	$t2,$a2,$bi
1252	adc	$acc4,xzr,xzr
1253	umulh	$t3,$a3,$bi
1254	ldr	$bi,[$bp,$cnt]		// next b[i]
1255	adds	$acc1,$acc1,$t0
1256	// (*)	mul	$t0,$m0,$mi
1257	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
1258	adcs	$acc2,$acc2,$t1
1259	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0
1260	adcs	$acc3,$acc3,$t2
1261	mul	$t2,$m2,$mi
1262	adc	$acc4,$acc4,$t3		// can't overflow
1263	mul	$t3,$m3,$mi
1264	// (*)	adds	xzr,$acc0,$t0
1265	subs	xzr,$acc0,#1		// (*)
1266	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0
1267	adcs	$acc0,$acc1,$t1
1268	umulh	$t1,$m1,$mi
1269	adcs	$acc1,$acc2,$t2
1270	umulh	$t2,$m2,$mi
1271	adcs	$acc2,$acc3,$t3
1272	umulh	$t3,$m3,$mi
1273	adcs	$acc3,$acc4,$carry
1274	adc	$carry,xzr,xzr
1275	adds	$acc0,$acc0,$t0
1276	adcs	$acc1,$acc1,$t1
1277	adcs	$acc2,$acc2,$t2
1278	adcs	$acc3,$acc3,$t3
1279	//adc	$carry,$carry,xzr
1280	cbnz	$cnt,.Loop_mul4x_reduction
1281
1282	adc	$carry,$carry,xzr
1283	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
1284	ldp	$t2,$t3,[$tp,#8*6]
1285	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
1286	ldp	$a2,$a3,[$ap,#8*2]
1287	add	$ap,$ap,#8*4
1288	adds	$acc0,$acc0,$t0
1289	adcs	$acc1,$acc1,$t1
1290	adcs	$acc2,$acc2,$t2
1291	adcs	$acc3,$acc3,$t3
1292	//adc	$carry,$carry,xzr
1293
1294	ldr	$mi,[sp]		// t[0]*n0
1295	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
1296	ldp	$m2,$m3,[$np,#8*2]
1297	add	$np,$np,#8*4
1298
1299.align	4
1300.Loop_mul4x_tail:
1301	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
1302	adc	$carry,$carry,xzr	// modulo-scheduled
1303	mul	$t1,$a1,$bi
1304	add	$cnt,$cnt,#8
1305	mul	$t2,$a2,$bi
1306	and	$cnt,$cnt,#31
1307	mul	$t3,$a3,$bi
1308	adds	$acc0,$acc0,$t0
1309	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
1310	adcs	$acc1,$acc1,$t1
1311	umulh	$t1,$a1,$bi
1312	adcs	$acc2,$acc2,$t2
1313	umulh	$t2,$a2,$bi
1314	adcs	$acc3,$acc3,$t3
1315	umulh	$t3,$a3,$bi
1316	adc	$acc4,xzr,xzr
1317	ldr	$bi,[$bp,$cnt]		// next b[i]
1318	adds	$acc1,$acc1,$t0
1319	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
1320	adcs	$acc2,$acc2,$t1
1321	mul	$t1,$m1,$mi
1322	adcs	$acc3,$acc3,$t2
1323	mul	$t2,$m2,$mi
1324	adc	$acc4,$acc4,$t3		// can't overflow
1325	mul	$t3,$m3,$mi
1326	adds	$acc0,$acc0,$t0
1327	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
1328	adcs	$acc1,$acc1,$t1
1329	umulh	$t1,$m1,$mi
1330	adcs	$acc2,$acc2,$t2
1331	umulh	$t2,$m2,$mi
1332	adcs	$acc3,$acc3,$t3
1333	umulh	$t3,$m3,$mi
1334	adcs	$acc4,$acc4,$carry
1335	ldr	$mi,[sp,$cnt]		// next a[0]*n0
1336	adc	$carry,xzr,xzr
1337	str	$acc0,[$tp],#8		// result!!!
1338	adds	$acc0,$acc1,$t0
1339	sub	$t0,$ap_end,$ap		// done yet?
1340	adcs	$acc1,$acc2,$t1
1341	adcs	$acc2,$acc3,$t2
1342	adcs	$acc3,$acc4,$t3
1343	//adc	$carry,$carry,xzr
1344	cbnz	$cnt,.Loop_mul4x_tail
1345
1346	sub	$t1,$np,$num		// rewinded np?
1347	adc	$carry,$carry,xzr
1348	cbz	$t0,.Loop_mul4x_break
1349
1350	ldp	$t0,$t1,[$tp,#8*4]
1351	ldp	$t2,$t3,[$tp,#8*6]
1352	ldp	$a0,$a1,[$ap,#8*0]
1353	ldp	$a2,$a3,[$ap,#8*2]
1354	add	$ap,$ap,#8*4
1355	adds	$acc0,$acc0,$t0
1356	adcs	$acc1,$acc1,$t1
1357	adcs	$acc2,$acc2,$t2
1358	adcs	$acc3,$acc3,$t3
1359	//adc	$carry,$carry,xzr
1360	ldp	$m0,$m1,[$np,#8*0]
1361	ldp	$m2,$m3,[$np,#8*2]
1362	add	$np,$np,#8*4
1363	b	.Loop_mul4x_tail
1364
1365.align	4
1366.Loop_mul4x_break:
1367	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
1368	adds	$acc0,$acc0,$topmost
1369	add	$bp,$bp,#8*4		// bp++
1370	adcs	$acc1,$acc1,xzr
1371	sub	$ap,$ap,$num		// rewind ap
1372	adcs	$acc2,$acc2,xzr
1373	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
1374	adcs	$acc3,$acc3,xzr
1375	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
1376	adc	$topmost,$carry,xzr
1377	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
1378	cmp	$bp,$t3			// done yet?
1379	ldp	$acc2,$acc3,[sp,#8*6]
1380	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
1381	ldp	$m2,$m3,[$t1,#8*2]
1382	add	$np,$t1,#8*4
1383	b.eq	.Lmul4x_post
1384
1385	ldr	$bi,[$bp]
1386	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
1387	ldp	$a2,$a3,[$ap,#8*2]
1388	adds	$ap,$ap,#8*4		// clear carry bit
1389	mov	$carry,xzr
1390	mov	$tp,sp
1391	b	.Loop_mul4x_reduction
1392
1393.align	4
1394.Lmul4x_post:
1395	// Final step. We see if result is larger than modulus, and
1396	// if it is, subtract the modulus. But comparison implies
1397	// subtraction. So we subtract modulus, see if it borrowed,
1398	// and conditionally copy original value.
1399	mov	$rp,$t2
1400	mov	$ap_end,$t2		// $rp copy
1401	subs	$t0,$acc0,$m0
1402	add	$tp,sp,#8*8
1403	sbcs	$t1,$acc1,$m1
1404	sub	$cnt,$num,#8*4
1405
1406.Lmul4x_sub:
1407	sbcs	$t2,$acc2,$m2
1408	ldp	$m0,$m1,[$np,#8*0]
1409	sub	$cnt,$cnt,#8*4
1410	ldp	$acc0,$acc1,[$tp,#8*0]
1411	sbcs	$t3,$acc3,$m3
1412	ldp	$m2,$m3,[$np,#8*2]
1413	add	$np,$np,#8*4
1414	ldp	$acc2,$acc3,[$tp,#8*2]
1415	add	$tp,$tp,#8*4
1416	stp	$t0,$t1,[$rp,#8*0]
1417	sbcs	$t0,$acc0,$m0
1418	stp	$t2,$t3,[$rp,#8*2]
1419	add	$rp,$rp,#8*4
1420	sbcs	$t1,$acc1,$m1
1421	cbnz	$cnt,.Lmul4x_sub
1422
1423	sbcs	$t2,$acc2,$m2
1424	 mov	$tp,sp
1425	 add	$ap,sp,#8*4
1426	 ldp	$a0,$a1,[$ap_end,#8*0]
1427	sbcs	$t3,$acc3,$m3
1428	stp	$t0,$t1,[$rp,#8*0]
1429	 ldp	$a2,$a3,[$ap_end,#8*2]
1430	stp	$t2,$t3,[$rp,#8*2]
1431	 ldp	$acc0,$acc1,[$ap,#8*0]
1432	 ldp	$acc2,$acc3,[$ap,#8*2]
1433	sbcs	xzr,$topmost,xzr	// did it borrow?
1434	ldr	x30,[x29,#8]		// pull return address
1435
1436	sub	$cnt,$num,#8*4
1437.Lmul4x_cond_copy:
1438	sub	$cnt,$cnt,#8*4
1439	csel	$t0,$acc0,$a0,lo
1440	 stp	xzr,xzr,[$tp,#8*0]
1441	csel	$t1,$acc1,$a1,lo
1442	ldp	$a0,$a1,[$ap_end,#8*4]
1443	ldp	$acc0,$acc1,[$ap,#8*4]
1444	csel	$t2,$acc2,$a2,lo
1445	 stp	xzr,xzr,[$tp,#8*2]
1446	 add	$tp,$tp,#8*4
1447	csel	$t3,$acc3,$a3,lo
1448	ldp	$a2,$a3,[$ap_end,#8*6]
1449	ldp	$acc2,$acc3,[$ap,#8*6]
1450	add	$ap,$ap,#8*4
1451	stp	$t0,$t1,[$ap_end,#8*0]
1452	stp	$t2,$t3,[$ap_end,#8*2]
1453	add	$ap_end,$ap_end,#8*4
1454	cbnz	$cnt,.Lmul4x_cond_copy
1455
1456	csel	$t0,$acc0,$a0,lo
1457	 stp	xzr,xzr,[$tp,#8*0]
1458	csel	$t1,$acc1,$a1,lo
1459	 stp	xzr,xzr,[$tp,#8*2]
1460	csel	$t2,$acc2,$a2,lo
1461	 stp	xzr,xzr,[$tp,#8*3]
1462	csel	$t3,$acc3,$a3,lo
1463	 stp	xzr,xzr,[$tp,#8*4]
1464	stp	$t0,$t1,[$ap_end,#8*0]
1465	stp	$t2,$t3,[$ap_end,#8*2]
1466
1467	b	.Lmul4x_done
1468
1469.align	4
1470.Lmul4x4_post_condition:
1471	adc	$carry,$carry,xzr
1472	ldr	$ap,[x29,#96]		// pull rp
1473	// $acc0-3,$carry hold result, $m0-7 hold modulus
1474	subs	$a0,$acc0,$m0
1475	ldr	x30,[x29,#8]		// pull return address
1476	sbcs	$a1,$acc1,$m1
1477	 stp	xzr,xzr,[sp,#8*0]
1478	sbcs	$a2,$acc2,$m2
1479	 stp	xzr,xzr,[sp,#8*2]
1480	sbcs	$a3,$acc3,$m3
1481	 stp	xzr,xzr,[sp,#8*4]
1482	sbcs	xzr,$carry,xzr		// did it borrow?
1483	 stp	xzr,xzr,[sp,#8*6]
1484
1485	// $a0-3 hold result-modulus
1486	csel	$a0,$acc0,$a0,lo
1487	csel	$a1,$acc1,$a1,lo
1488	csel	$a2,$acc2,$a2,lo
1489	csel	$a3,$acc3,$a3,lo
1490	stp	$a0,$a1,[$ap,#8*0]
1491	stp	$a2,$a3,[$ap,#8*2]
1492
1493.Lmul4x_done:
1494	ldp	x19,x20,[x29,#16]
1495	mov	sp,x29
1496	ldp	x21,x22,[x29,#32]
1497	mov	x0,#1
1498	ldp	x23,x24,[x29,#48]
1499	ldp	x25,x26,[x29,#64]
1500	ldp	x27,x28,[x29,#80]
1501	ldr	x29,[sp],#128
1502	.inst	0xd50323bf		// autiasp
1503	ret
1504.size	__bn_mul4x_mont,.-__bn_mul4x_mont
1505___
1506}
1507$code.=<<___;
1508.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
1509.align	4
1510___
1511
1512print $code;
1513
1514close STDOUT or die "error closing STDOUT: $!";
1515