#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must.
# Well, actually all contemporary AArch64 processors seem to have a
# limited multiplication issue rate, i.e. they can't issue a
# multiplication every cycle, which explains the moderate improvement
# coefficients in comparison to compiler-generated code. Recall that
# the compiler is instructed to use umulh and therefore uses the same
# amount of multiplication instructions to do the job. Assembly's edge
# is to minimize the number of "collateral" instructions and, of
# course, instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while other processors
# exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
# ~25% faster on Cortex-A57 and ~60-100% faster on others.

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);
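#
# For orientation, a minimal C sketch of the word-by-word Montgomery
# multiplication computed here, rp = ap*bp/2^(64*num) mod np. This is
# an illustrative reference only (not OpenSSL's C fallback and not
# constant-time), assuming 64-bit BN_ULONG, <stdint.h>/<string.h>,
# and a compiler with unsigned __int128 and C99 VLAs:
#
#	static void mont_mul_ref(uint64_t *rp, const uint64_t *ap,
#	                         const uint64_t *bp, const uint64_t *np,
#	                         uint64_t n0, int num)
#	{
#	    uint64_t t[num + 2];                /* temporary t[], zeroed */
#	    memset(t, 0, sizeof(t));
#
#	    for (int i = 0; i < num; i++) {
#	        unsigned __int128 c = 0;
#	        for (int j = 0; j < num; j++) { /* t += ap[]*bp[i] */
#	            c += (unsigned __int128)ap[j] * bp[i] + t[j];
#	            t[j] = (uint64_t)c; c >>= 64;
#	        }
#	        c += t[num];
#	        t[num] = (uint64_t)c;
#	        t[num + 1] = (uint64_t)(c >> 64);
#
#	        uint64_t m1 = t[0] * n0;        /* "tp[0]"*n0 */
#	        c = ((unsigned __int128)np[0] * m1 + t[0]) >> 64;
#	        for (int j = 1; j < num; j++) { /* t = (t+np[]*m1)/2^64 */
#	            c += (unsigned __int128)np[j] * m1 + t[j];
#	            t[j - 1] = (uint64_t)c; c >>= 64;
#	        }
#	        c += t[num];
#	        t[num - 1] = (uint64_t)c;
#	        t[num] = t[num + 1] + (uint64_t)(c >> 64);
#	    }
#
#	    /* conditionally subtract the modulus, see .Lsub below */
#	    unsigned __int128 d; uint64_t borrow = 0;
#	    for (int j = 0; j < num; j++) {
#	        d = (unsigned __int128)t[j] - np[j] - borrow;
#	        rp[j] = (uint64_t)d;
#	        borrow = (uint64_t)(d >> 64) & 1;
#	    }
#	    if (t[num] < borrow)                /* t < np: keep t */
#	        memcpy(rp, t, num * sizeof(t[0]));
#	}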

$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for the removal of the first multiplication and
	//	addition instructions: the outcome of the first addition
	//	is guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. The
	//	question then is when does it carry? Is there an
	//	alternative way to deduce it? If you follow the
	//	operations, you can observe that the condition for carry
	//	is quite simple: $lo0 being non-zero. The carry can
	//	therefore be calculated by adding -1 to $lo0, which is
	//	what the next instruction does.
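	//	Concretely: m1 = $lo0*n0 mod 2^64 with n0 == -np[0]^-1
	//	mod 2^64, so lo(np[0]*m1) == (2^64-$lo0) mod 2^64, and the
	//	discarded sum is either 0+0 (no carry) or exactly 2^64
	//	(carry). "subs xzr,$lo0,#1" sets the C flag iff $lo0>=1,
	//	i.e. iff $lo0!=0, reproducing that carry for the adc below.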
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
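	// In C terms (sketch): borrow = ((t - np) underflows);
	//	rp[] = borrow ? t[] : t[]-np[];
	// done with csel rather than a branch, so no data-dependent
	// code path is taken.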
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# Following is an ARMv8 adaptation of sqrx8x_mont from the x86_64-mont5
# module.
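#
# The driving identity (for illustration): with a = sum_i a[i]*2^(64*i),
#
#	a^2 = sum_i a[i]^2 * 2^(128*i)
#	    + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j))
#
# so the code below first accumulates the cross products a[i]*a[j],
# i<j (the triangles drawn further down), then doubles that sum with a
# 1-bit left shift and adds the squares a[i]*a[i] on the diagonal.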

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
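	//
	// Each product a[i]*a[j] sits in the column of weight i+j: its
	// low half accumulates into t[i+j] and its high half into
	// t[i+j+1]. Passes (i)-(vii) below each add one row of this
	// triangle into the running window t[1..14].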

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+hi(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	 mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	 mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	 mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	 mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	 mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	 mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	 mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	 mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	 mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	 mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	 mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	 mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	 mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	 umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	 mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	 umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewound ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply the above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
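	// (The doubling is a 1-bit left shift spread across words,
	// done with extr: "extr x,hi,lo,#63" computes (hi:lo)>>63,
	// i.e. hi<<1|lo>>63 -- word "hi" doubled, with the top bit of
	// its lower neighbour shifted in.)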
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	 ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	 ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	 ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	 ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	 ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
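	// (Each pass of .Lsqr8x_reduction below computes
	// na0 = t[0]*n0 mod 2^64 and adds n[0..7]*na0 to the window,
	// zeroing t[0]; eight passes retire eight words, i.e. 512 bits,
	// of t[]. The na0 values are put aside and multiplied by the
	// remaining words n[8..] in the .Lsqr8x_tail loop.)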
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewound np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	 mov	$tp,sp
	 add	$ap,sp,$num
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	 ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	 ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	 stp	xzr,xzr,[$ap,#8*0]
	 stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	 stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	 stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	 stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	 stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	 stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it differs in that it performs
# reduction 256 bits at a time.
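#
# Rough shape of the interleave (illustration): for every window of
# four b words, the reduction loop below both multiplies,
# t[] += a[0..3]*b[i], and reduces, mi = t[0]*n0 mod 2^64 followed by
# t[] += n[0..3]*mi, retiring one word of t per step; the four mi
# values are put aside and the tail loop then propagates a[4..]*b[i]
# and n[4..]*mi across the rest of the window.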

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my  $bp_end=$rp;
my  ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	 mov	$tp,sp
	 add	$ap,sp,#8*4
	 ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	 ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	 ldp	$acc0,$acc1,[$ap,#8*0]
	 ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*2]
	 add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	 stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	 stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	 stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	 stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	 stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	 stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	 stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	 stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT or die "error closing STDOUT: $!";